-rw-r--r-- .github/workflows/build-ci-container-windows.yml | 2
-rw-r--r-- .github/workflows/build-ci-container.yml | 2
-rw-r--r-- .github/workflows/build-metrics-container.yml | 2
-rw-r--r-- .github/workflows/check-ci.yml | 2
-rw-r--r-- .github/workflows/ci-post-commit-analyzer.yml | 4
-rw-r--r-- .github/workflows/commit-access-review.yml | 2
-rw-r--r-- .github/workflows/containers/github-action-ci/Dockerfile | 1
-rw-r--r-- .github/workflows/docs.yml | 6
-rw-r--r-- .github/workflows/email-check.yaml | 2
-rw-r--r-- .github/workflows/gha-codeql.yml | 4
-rw-r--r-- .github/workflows/hlsl-test-all.yaml | 2
-rw-r--r-- .github/workflows/issue-write.yml | 2
-rw-r--r-- .github/workflows/libc-fullbuild-tests.yml | 2
-rw-r--r-- .github/workflows/libc-overlay-tests.yml | 2
-rw-r--r-- .github/workflows/libclang-abi-tests.yml | 4
-rw-r--r-- .github/workflows/libclang-python-tests.yml | 4
-rw-r--r-- .github/workflows/libcxx-build-and-test.yaml | 8
-rw-r--r-- .github/workflows/libcxx-build-containers.yml | 2
-rw-r--r-- .github/workflows/libcxx-check-generated-files.yml | 2
-rw-r--r-- .github/workflows/libcxx-run-benchmarks.yml | 2
-rw-r--r-- .github/workflows/llvm-bugs.yml | 2
-rw-r--r-- .github/workflows/llvm-tests.yml | 6
-rw-r--r-- .github/workflows/mlir-spirv-tests.yml | 2
-rw-r--r-- .github/workflows/pr-code-format.yml | 8
-rw-r--r-- .github/workflows/pr-code-lint.yml | 10
-rw-r--r-- .github/workflows/pr-request-release-note.yml | 2
-rw-r--r-- .github/workflows/premerge.yaml | 6
-rw-r--r-- .github/workflows/release-asset-audit.yml | 2
-rw-r--r-- .github/workflows/release-binaries-save-stage/action.yml | 4
-rw-r--r-- .github/workflows/release-binaries-setup-stage/action.yml | 2
-rw-r--r-- .github/workflows/release-binaries.yml | 10
-rw-r--r-- .github/workflows/release-documentation.yml | 4
-rw-r--r-- .github/workflows/release-doxygen.yml | 2
-rw-r--r-- .github/workflows/release-lit.yml | 6
-rw-r--r-- .github/workflows/release-sources.yml | 4
-rw-r--r-- .github/workflows/scorecard.yml | 6
-rw-r--r-- .github/workflows/spirv-tests.yml | 2
-rw-r--r-- .github/workflows/unprivileged-download-artifact/action.yml | 2
-rw-r--r-- bolt/lib/Rewrite/DWARFRewriter.cpp | 4
-rwxr-xr-x bolt/test/AArch64/dwarf4-dwp-aarch64.s | 407
-rwxr-xr-x bolt/test/X86/dwarf4-dwp-x86.s | 405
-rw-r--r-- clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp | 20
-rw-r--r-- clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp | 2
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp | 2
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp | 6
-rw-r--r-- clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h | 4
-rw-r--r-- clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp | 2
-rw-r--r-- clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h | 2
-rw-r--r-- clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp | 4
-rw-r--r-- clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp | 2
-rw-r--r-- clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h | 2
-rw-r--r-- clang-tools-extra/docs/ReleaseNotes.rst | 5
-rw-r--r-- clang-tools-extra/test/clang-reorder-fields/FlexibleArrayMember.c | 10
-rw-r--r-- clang/bindings/python/clang/cindex.py | 3
-rw-r--r-- clang/docs/OpenMPSupport.rst | 2
-rw-r--r-- clang/docs/ReleaseNotes.rst | 4
-rw-r--r-- clang/include/clang-c/Index.h | 4
-rw-r--r-- clang/include/clang/AST/OpenMPClause.h | 74
-rw-r--r-- clang/include/clang/AST/RecursiveASTVisitor.h | 11
-rw-r--r-- clang/include/clang/AST/StmtOpenMP.h | 196
-rw-r--r-- clang/include/clang/Basic/AMDGPUTypes.def | 1
-rw-r--r-- clang/include/clang/Basic/Builtins.def | 1
-rw-r--r-- clang/include/clang/Basic/DiagnosticParseKinds.td | 2
-rw-r--r-- clang/include/clang/Basic/DiagnosticSemaKinds.td | 18
-rw-r--r-- clang/include/clang/Basic/OpenMPKinds.h | 7
-rw-r--r-- clang/include/clang/Basic/StmtNodes.td | 4
-rw-r--r-- clang/include/clang/Basic/TargetInfo.h | 4
-rw-r--r-- clang/include/clang/CIR/Dialect/IR/CIROps.td | 39
-rw-r--r-- clang/include/clang/CIR/MissingFeatures.h | 2
-rw-r--r-- clang/include/clang/Parse/Parser.h | 3
-rw-r--r-- clang/include/clang/Sema/Sema.h | 4
-rw-r--r-- clang/include/clang/Sema/SemaOpenMP.h | 89
-rw-r--r-- clang/include/clang/Serialization/ASTBitCodes.h | 1
-rw-r--r-- clang/lib/AST/ASTContext.cpp | 4
-rw-r--r-- clang/lib/AST/ByteCode/Compiler.cpp | 18
-rw-r--r-- clang/lib/AST/ByteCode/Compiler.h | 1
-rw-r--r-- clang/lib/AST/ByteCode/Context.cpp | 15
-rw-r--r-- clang/lib/AST/ByteCode/Interp.cpp | 2
-rw-r--r-- clang/lib/AST/ByteCode/Interp.h | 15
-rw-r--r-- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 145
-rw-r--r-- clang/lib/AST/ByteCode/InterpFrame.cpp | 2
-rw-r--r-- clang/lib/AST/ByteCode/Opcodes.td | 1
-rw-r--r-- clang/lib/AST/ByteCode/Pointer.h | 1
-rw-r--r-- clang/lib/AST/OpenMPClause.cpp | 35
-rw-r--r-- clang/lib/AST/StmtOpenMP.cpp | 73
-rw-r--r-- clang/lib/AST/StmtPrinter.cpp | 5
-rw-r--r-- clang/lib/AST/StmtProfile.cpp | 16
-rw-r--r-- clang/lib/Basic/OpenMPKinds.cpp | 11
-rw-r--r-- clang/lib/Basic/TargetInfo.cpp | 49
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenClass.cpp | 42
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp | 215
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 11
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenFunction.h | 5
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 13
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 1
-rw-r--r-- clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 128
-rw-r--r-- clang/lib/CodeGen/BackendUtil.cpp | 5
-rw-r--r-- clang/lib/CodeGen/CGDebugInfo.cpp | 64
-rw-r--r-- clang/lib/CodeGen/CGDebugInfo.h | 1
-rw-r--r-- clang/lib/CodeGen/CGStmt.cpp | 61
-rw-r--r-- clang/lib/CodeGen/CGStmtOpenMP.cpp | 42
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.h | 1
-rw-r--r-- clang/lib/CodeGen/CodeGenPGO.cpp | 4
-rw-r--r-- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 27
-rw-r--r-- clang/lib/Driver/Driver.cpp | 3
-rw-r--r-- clang/lib/Format/TokenAnnotator.cpp | 5
-rw-r--r-- clang/lib/Format/UnwrappedLineParser.cpp | 25
-rw-r--r-- clang/lib/Frontend/InitPreprocessor.cpp | 5
-rw-r--r-- clang/lib/Frontend/ModuleDependencyCollector.cpp | 11
-rw-r--r-- clang/lib/Headers/avxintrin.h | 23
-rw-r--r-- clang/lib/Parse/ParseExprCXX.cpp | 2
-rw-r--r-- clang/lib/Parse/ParseOpenMP.cpp | 36
-rw-r--r-- clang/lib/Sema/AnalysisBasedWarnings.cpp | 7
-rw-r--r-- clang/lib/Sema/SemaConcept.cpp | 2
-rw-r--r-- clang/lib/Sema/SemaExceptionSpec.cpp | 1
-rw-r--r-- clang/lib/Sema/SemaExpr.cpp | 5
-rw-r--r-- clang/lib/Sema/SemaOpenACCAtomic.cpp | 7
-rw-r--r-- clang/lib/Sema/SemaOpenMP.cpp | 833
-rw-r--r-- clang/lib/Sema/SemaTemplate.cpp | 2
-rw-r--r-- clang/lib/Sema/SemaTemplateInstantiate.cpp | 88
-rw-r--r-- clang/lib/Sema/SemaTypeTraits.cpp | 17
-rw-r--r-- clang/lib/Sema/TreeTransform.h | 44
-rw-r--r-- clang/lib/Serialization/ASTReader.cpp | 11
-rw-r--r-- clang/lib/Serialization/ASTReaderStmt.cpp | 17
-rw-r--r-- clang/lib/Serialization/ASTWriter.cpp | 8
-rw-r--r-- clang/lib/Serialization/ASTWriterStmt.cpp | 12
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp | 4
-rw-r--r-- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1
-rw-r--r-- clang/test/AST/ByteCode/cxx23.cpp | 61
-rw-r--r-- clang/test/AST/ByteCode/invalid.cpp | 2
-rw-r--r-- clang/test/Analysis/buffer-overlap-decls.c | 23
-rw-r--r-- clang/test/Analysis/buffer-overlap.c | 7
-rw-r--r-- clang/test/CIR/CodeGen/complex.cpp | 37
-rw-r--r-- clang/test/CIR/CodeGen/delete.cpp | 88
-rw-r--r-- clang/test/CIR/CodeGen/lang-c-cpp.cpp | 4
-rw-r--r-- clang/test/CIR/CodeGen/module-filename.cpp | 11
-rw-r--r-- clang/test/CIR/CodeGen/opt-info-attr.cpp | 12
-rw-r--r-- clang/test/CIR/CodeGen/vbase.cpp | 82
-rw-r--r-- clang/test/CIR/CodeGen/vector-ext.cpp | 20
-rw-r--r-- clang/test/CIR/CodeGen/vector.cpp | 20
-rw-r--r-- clang/test/CIR/IR/global-init.cir | 48
-rw-r--r-- clang/test/CodeGen/X86/avx-builtins.c | 3
-rw-r--r-- clang/test/CodeGen/X86/bmi-builtins.c | 99
-rw-r--r-- clang/test/CodeGen/X86/bmi2-builtins.c | 5
-rw-r--r-- clang/test/CodeGen/X86/tbm-builtins.c | 83
-rw-r--r-- clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c | 17
-rw-r--r-- clang/test/CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp | 7
-rw-r--r-- clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp | 423
-rw-r--r-- clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp | 103
-rw-r--r-- clang/test/CodeGenCXX/gh56652.cpp | 41
-rw-r--r-- clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl | 21
-rw-r--r-- clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl | 28
-rw-r--r-- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 3
-rw-r--r-- clang/test/Driver/modules-print-library-module-manifest-path.cpp | 8
-rw-r--r-- clang/test/Lexer/cxx-features.cpp | 2
-rw-r--r-- clang/test/OpenMP/for_reduction_codegen.cpp | 32
-rw-r--r-- clang/test/OpenMP/fuse_ast_print.cpp | 397
-rw-r--r-- clang/test/OpenMP/fuse_codegen.cpp | 2328
-rw-r--r-- clang/test/OpenMP/fuse_messages.cpp | 209
-rw-r--r-- clang/test/Parser/cxx2b-lambdas-ext-warns.cpp | 32
-rw-r--r-- clang/test/SemaCXX/amdgpu-image-rsrc.cpp | 17
-rw-r--r-- clang/test/SemaCXX/decltype.cpp | 78
-rw-r--r-- clang/test/SemaCXX/invalid-requirement-requires-expr.cpp | 3
-rw-r--r-- clang/test/SemaCXX/type-traits.cpp | 43
-rw-r--r-- clang/test/SemaOpenCL/amdgpu-image-rsrc.cl | 13
-rw-r--r-- clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp | 12
-rw-r--r-- clang/test/SemaTemplate/instantiation-depth-subst-2.cpp | 1
-rw-r--r-- clang/test/SemaTemplate/instantiation-depth-subst.cpp | 3
-rw-r--r-- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 2
-rw-r--r-- clang/tools/libclang/CIndex.cpp | 19
-rw-r--r-- clang/tools/libclang/CXCursor.cpp | 3
-rw-r--r-- clang/unittests/Format/TokenAnnotatorTest.cpp | 14
-rw-r--r-- compiler-rt/include/xray/xray_interface.h | 37
-rw-r--r-- compiler-rt/test/builtins/Unit/fixunstfdi_test.c | 19
-rw-r--r-- compiler-rt/test/builtins/Unit/multc3_test.c | 26
-rw-r--r-- compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c | 56
-rw-r--r-- flang/include/flang/Lower/OpenMP/Clauses.h | 1
-rw-r--r-- flang/include/flang/Parser/dump-parse-tree.h | 1
-rw-r--r-- flang/include/flang/Parser/parse-tree.h | 9
-rw-r--r-- flang/lib/Lower/OpenMP/Clauses.cpp | 5
-rw-r--r-- flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp | 208
-rw-r--r-- flang/lib/Parser/openmp-parsers.cpp | 5
-rw-r--r-- flang/lib/Parser/unparse.cpp | 7
-rw-r--r-- flang/lib/Semantics/check-omp-structure.cpp | 6
-rw-r--r-- flang/lib/Semantics/resolve-directives.cpp | 64
-rw-r--r-- flang/lib/Semantics/resolve-names.cpp | 4
-rw-r--r-- flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir | 345
-rw-r--r-- flang/test/Lower/OpenMP/infinite-loop-in-construct.f90 | 6
-rw-r--r-- flang/test/Semantics/OpenMP/allocate01.f90 | 1
-rw-r--r-- flang/test/Semantics/OpenMP/allocate08.f90 | 3
-rw-r--r-- flang/test/Semantics/OpenMP/allocators04.f90 | 2
-rw-r--r-- flang/test/Semantics/OpenMP/allocators05.f90 | 2
-rw-r--r-- libc/shared/math.h | 1
-rw-r--r-- libc/shared/math/exp10m1f.h | 23
-rw-r--r-- libc/src/__support/math/CMakeLists.txt | 17
-rw-r--r-- libc/src/__support/math/exp10m1f.h | 234
-rw-r--r-- libc/src/math/generic/CMakeLists.txt | 11
-rw-r--r-- libc/src/math/generic/exp10m1f.cpp | 209
-rw-r--r-- libc/src/string/memory_utils/op_generic.h | 15
-rw-r--r-- libc/test/UnitTest/FEnvSafeTest.cpp | 2
-rw-r--r-- libc/test/shared/CMakeLists.txt | 1
-rw-r--r-- libc/test/shared/shared_math_test.cpp | 1
-rw-r--r-- libclc/Maintainers.md | 13
-rw-r--r-- libcxx/docs/ReleaseNotes/22.rst | 2
-rw-r--r-- libcxx/docs/TestingLibcxx.rst | 4
-rw-r--r-- libcxx/include/__algorithm/find.h | 110
-rw-r--r-- libcxx/include/__algorithm/simd_utils.h | 5
-rw-r--r-- libcxx/include/__flat_map/flat_map.h | 2
-rw-r--r-- libcxx/include/__flat_map/flat_multimap.h | 5
-rw-r--r-- libcxx/include/__flat_map/key_value_iterator.h | 1
-rw-r--r-- libcxx/include/__flat_set/flat_multiset.h | 14
-rw-r--r-- libcxx/include/__flat_set/flat_set.h | 8
-rw-r--r-- libcxx/include/module.modulemap.in | 7
-rw-r--r-- libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp | 1
-rw-r--r-- lld/ELF/InputSection.cpp | 48
-rw-r--r-- lld/ELF/InputSection.h | 6
-rw-r--r-- lld/ELF/MarkLive.cpp | 64
-rw-r--r-- lld/ELF/SyntheticSections.cpp | 66
-rw-r--r-- lld/ELF/SyntheticSections.h | 15
-rw-r--r-- lld/docs/ReleaseNotes.rst | 3
-rw-r--r-- lld/test/ELF/eh-frame-relocation.s | 29
-rw-r--r-- lld/test/wasm/archive-export.test | 3
-rw-r--r-- lld/test/wasm/comdats.ll | 3
-rw-r--r-- lld/test/wasm/visibility-hidden.ll | 3
-rw-r--r-- lld/wasm/Driver.cpp | 15
-rw-r--r-- lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp | 37
-rw-r--r-- lldb/test/Shell/SymbolFile/DWARF/incomplete-member-beyond-parent-bounds.yaml | 104
-rw-r--r-- lldb/test/Shell/SymbolFile/DWARF/member-beyond-parent-bounds.yaml | 109
-rw-r--r-- lldb/test/Shell/SymbolFile/DWARF/member-on-parent-bounds.yaml | 109
-rw-r--r-- lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml | 4
-rw-r--r-- lldb/test/Shell/SymbolFile/DWARF/zero-sized-member-in-parent-bounds.yaml | 105
-rw-r--r-- llvm/docs/CommandGuide/llvm-readelf.rst | 4
-rw-r--r-- llvm/docs/CommandGuide/llvm-readobj.rst | 4
-rw-r--r-- llvm/docs/CommandGuide/llvm-size.rst | 7
-rw-r--r-- llvm/docs/FuzzingLLVM.rst | 12
-rw-r--r-- llvm/docs/GettingInvolved.rst | 14
-rw-r--r-- llvm/docs/GlobalISel/InstructionSelect.rst | 18
-rw-r--r-- llvm/include/llvm/ADT/DenseMap.h | 4
-rw-r--r-- llvm/include/llvm/ADT/EquivalenceClasses.h | 12
-rw-r--r-- llvm/include/llvm/ADT/ImmutableMap.h | 44
-rw-r--r-- llvm/include/llvm/ADT/PackedVector.h | 108
-rw-r--r-- llvm/include/llvm/ADT/SmallVector.h | 23
-rw-r--r-- llvm/include/llvm/ADT/SparseSet.h | 26
-rw-r--r-- llvm/include/llvm/ADT/StringMap.h | 61
-rw-r--r-- llvm/include/llvm/ADT/StringSet.h | 4
-rw-r--r-- llvm/include/llvm/Bitstream/BitstreamWriter.h | 2
-rw-r--r-- llvm/include/llvm/CAS/FileOffset.h | 39
-rw-r--r-- llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h | 236
-rw-r--r-- llvm/include/llvm/CodeGen/DebugHandlerBase.h | 2
-rw-r--r-- llvm/include/llvm/CodeGen/LexicalScopes.h | 20
-rw-r--r-- llvm/include/llvm/CodeGen/RDFGraph.h | 2
-rw-r--r-- llvm/include/llvm/CodeGen/RDFRegisters.h | 54
-rw-r--r-- llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 13
-rw-r--r-- llvm/include/llvm/Frontend/OpenMP/OMP.td | 9
-rw-r--r-- llvm/include/llvm/IR/Intrinsics.td | 8
-rw-r--r-- llvm/include/llvm/IR/ProfDataUtils.h | 8
-rw-r--r-- llvm/include/llvm/Object/OffloadBundle.h | 2
-rw-r--r-- llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h | 20
-rw-r--r-- llvm/include/llvm/Support/Endian.h | 16
-rw-r--r-- llvm/include/llvm/Support/FileCollector.h | 3
-rw-r--r-- llvm/include/llvm/Support/SipHash.h | 7
-rw-r--r-- llvm/include/llvm/Support/TrailingObjects.h | 33
-rw-r--r-- llvm/include/llvm/TextAPI/SymbolSet.h | 2
-rw-r--r-- llvm/include/llvm/Transforms/InstCombine/InstCombiner.h | 20
-rw-r--r-- llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h | 6
-rw-r--r-- llvm/lib/Analysis/LazyValueInfo.cpp | 140
-rw-r--r-- llvm/lib/CAS/CMakeLists.txt | 5
-rw-r--r-- llvm/lib/CAS/DatabaseFile.cpp | 123
-rw-r--r-- llvm/lib/CAS/DatabaseFile.h | 153
-rw-r--r-- llvm/lib/CAS/InMemoryCAS.cpp | 3
-rw-r--r-- llvm/lib/CAS/OnDiskTrieRawHashMap.cpp | 1178
-rw-r--r-- llvm/lib/CGData/CodeGenDataWriter.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 107
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 35
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 32
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 4
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfFile.h | 9
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 13
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 16
-rw-r--r-- llvm/lib/CodeGen/LexicalScopes.cpp | 33
-rw-r--r-- llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/LiveDebugVariables.cpp | 2
-rw-r--r-- llvm/lib/CodeGen/MachineVerifier.cpp | 20
-rw-r--r-- llvm/lib/CodeGen/RDFLiveness.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 12
-rw-r--r-- llvm/lib/IR/Instructions.cpp | 6
-rw-r--r-- llvm/lib/IR/ProfDataUtils.cpp | 7
-rw-r--r-- llvm/lib/IR/Verifier.cpp | 4
-rw-r--r-- llvm/lib/MC/DXContainerRootSignature.cpp | 5
-rw-r--r-- llvm/lib/Object/ArchiveWriter.cpp | 20
-rw-r--r-- llvm/lib/Object/OffloadBundle.cpp | 18
-rw-r--r-- llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp | 12
-rw-r--r-- llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp | 2
-rw-r--r-- llvm/lib/ProfileData/InstrProf.cpp | 2
-rw-r--r-- llvm/lib/ProfileData/InstrProfReader.cpp | 10
-rw-r--r-- llvm/lib/Support/FileCollector.cpp | 5
-rw-r--r-- llvm/lib/Support/Mustache.cpp | 18
-rw-r--r-- llvm/lib/Support/SipHash.cpp | 11
-rw-r--r-- llvm/lib/Support/VirtualFileSystem.cpp | 2
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 178
-rw-r--r-- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 20
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 5
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 4
-rw-r--r-- llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 19
-rw-r--r-- llvm/lib/Target/Hexagon/RDFCopy.cpp | 2
-rw-r--r-- llvm/lib/Target/Hexagon/RDFCopy.h | 8
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 19
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchISelLowering.h | 3
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 16
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td | 16
-rw-r--r-- llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp | 7
-rw-r--r-- llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 8
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 39
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 51
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 24
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 5
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 11
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 33
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 67
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 8
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 68
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 24
-rw-r--r-- llvm/lib/Target/X86/X86FixupInstTuning.cpp | 54
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp | 32
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.h | 3
-rw-r--r-- llvm/lib/Target/X86/X86InstrAVX512.td | 50
-rw-r--r-- llvm/lib/Target/X86/X86InstrArithmetic.td | 23
-rw-r--r-- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 10
-rw-r--r-- llvm/lib/Target/X86/X86InstrInfo.cpp | 34
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 4
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 12
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 18
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp | 2
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6
-rw-r--r-- llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp | 30
-rw-r--r-- llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp | 50
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 43
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 23
-rw-r--r-- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlan.cpp | 22
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 16
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 118
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 17
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 70
-rw-r--r-- llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll | 40
-rw-r--r-- llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll | 16
-rw-r--r-- llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll | 732
-rw-r--r-- llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll | 1454
-rw-r--r-- llvm/test/CodeGen/AArch64/icmp.ll | 51
-rw-r--r-- llvm/test/CodeGen/AArch64/sme-za-exceptions.ll | 474
-rw-r--r-- llvm/test/CodeGen/AArch64/strict-fp-opt.ll | 150
-rw-r--r-- llvm/test/CodeGen/AArch64/vecreduce-bool.ll | 36
-rw-r--r-- llvm/test/CodeGen/AMDGPU/bf16.ll | 4706
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 29
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll | 9
-rw-r--r-- llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll | 46
-rw-r--r-- llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll | 341
-rw-r--r-- llvm/test/CodeGen/AMDGPU/v_mac.ll | 9
-rw-r--r-- llvm/test/CodeGen/AMDGPU/v_mac_f16.ll | 14
-rw-r--r-- llvm/test/CodeGen/ARM/build-attributes.ll | 319
-rw-r--r-- llvm/test/CodeGen/LoongArch/lasx/abs.ll | 128
-rw-r--r-- llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll | 24
-rw-r--r-- llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll | 58
-rw-r--r-- llvm/test/CodeGen/LoongArch/lsx/abs.ll | 128
-rw-r--r-- llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll | 24
-rw-r--r-- llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll | 58
-rw-r--r-- llvm/test/CodeGen/LoongArch/merge-offset-option.ll | 24
-rw-r--r-- llvm/test/CodeGen/PowerPC/scalar_cmp.ll | 226
-rw-r--r-- llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll | 106
-rw-r--r-- llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll | 24
-rw-r--r-- llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll | 48
-rw-r--r-- llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll | 19
-rw-r--r-- llvm/test/CodeGen/SPIRV/image_store.ll | 22
-rw-r--r-- llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll | 28
-rw-r--r-- llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll | 56
-rw-r--r-- llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll | 14
-rw-r--r-- llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll | 114
-rw-r--r-- llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll | 1
-rw-r--r-- llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll | 86
-rw-r--r-- llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll | 30
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll | 3
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll | 11
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll | 140
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll | 16
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll | 30
-rw-r--r-- llvm/test/CodeGen/SPIRV/transcoding/float16.ll | 25
-rw-r--r-- llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll | 7
-rw-r--r-- llvm/test/CodeGen/X86/avx512-mask-op.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/combine-add.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/combine-mul.ll | 2
-rw-r--r-- llvm/test/CodeGen/X86/combine-sdiv.ll | 30
-rw-r--r-- llvm/test/CodeGen/X86/dbg-distringtype-uint.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/dpbusd.ll | 202
-rw-r--r-- llvm/test/CodeGen/X86/dpbusd_const.ll | 192
-rw-r--r-- llvm/test/CodeGen/X86/ftrunc.ll | 26
-rw-r--r-- llvm/test/CodeGen/X86/isint.ll | 124
-rw-r--r-- llvm/test/CodeGen/X86/known-signbits-shl.ll | 2
-rw-r--r-- llvm/test/CodeGen/X86/lea-16bit.ll | 3
-rw-r--r-- llvm/test/CodeGen/X86/lea-8bit.ll | 3
-rw-r--r-- llvm/test/CodeGen/X86/masked_gather_scatter.ll | 33
-rw-r--r-- llvm/test/CodeGen/X86/negative-sin.ll | 15
-rw-r--r-- llvm/test/CodeGen/X86/oddsubvector.ll | 12
-rw-r--r-- llvm/test/CodeGen/X86/pr62286.ll | 38
-rw-r--r-- llvm/test/CodeGen/X86/pr74736.ll | 20
-rw-r--r-- llvm/test/CodeGen/X86/setoeq.ll | 566
-rw-r--r-- llvm/test/CodeGen/X86/shift-i512.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll | 24
-rw-r--r-- llvm/test/CodeGen/X86/vec_shift6.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/vector-gep.ll | 128
-rw-r--r-- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 10
-rw-r--r-- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 8
-rw-r--r-- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 6
-rw-r--r-- llvm/test/CodeGen/X86/vector-mul.ll | 8
-rw-r--r-- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 2
-rw-r--r-- llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll | 5
-rw-r--r-- llvm/test/CodeGen/X86/vector-shuffle-combining.ll | 4
-rw-r--r-- llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll | 64
-rw-r--r-- llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll | 43
-rw-r--r-- llvm/test/DebugInfo/AArch64/debug-types.ll | 59
-rw-r--r-- llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll | 67
-rw-r--r-- llvm/test/DebugInfo/Generic/inlined-static-var.ll | 93
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt | 37
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt | 23
-rw-r--r-- llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt | 155
-rw-r--r-- llvm/test/MachineVerifier/test_g_build_vector.mir | 6
-rw-r--r-- llvm/test/Transforms/CorrelatedValuePropagation/track-predecessor-ranges.ll | 98
-rw-r--r-- llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll | 136
-rw-r--r-- llvm/test/Transforms/InstCombine/preserve-profile.ll | 50
-rw-r--r-- llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 65
-rw-r--r-- llvm/test/Transforms/InstCombine/vector-reductions.ll | 171
-rw-r--r-- llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll | 741
-rw-r--r-- llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll | 12
-rw-r--r-- llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll | 34
-rw-r--r-- llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll | 21
-rw-r--r-- llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll | 21
-rw-r--r-- llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll | 19
-rw-r--r-- llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll | 134
-rw-r--r-- llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll | 17
-rw-r--r-- llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll | 18
-rw-r--r-- llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll | 30
-rw-r--r-- llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll | 56
-rw-r--r-- llvm/test/Verifier/preallocated-invalid.ll | 10
-rw-r--r-- llvm/test/lit.cfg.py | 18
-rw-r--r-- llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml | 6
-rw-r--r-- llvm/test/tools/llvm-lib/sym64-threshold.test | 71
-rw-r--r-- llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading-fail.test | 26
-rw-r--r-- llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test | 27
-rw-r--r-- llvm/test/tools/llvm-size/macho-pagezero.test | 108
-rw-r--r-- llvm/tools/llvm-cov/CoverageExporterJson.cpp | 1
-rw-r--r-- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 8
-rw-r--r-- llvm/tools/llvm-readobj/ObjDumper.cpp | 12
-rw-r--r-- llvm/tools/llvm-readobj/ObjDumper.h | 2
-rw-r--r-- llvm/tools/llvm-readobj/Opts.td | 1
-rw-r--r-- llvm/tools/llvm-readobj/llvm-readobj.cpp | 5
-rw-r--r-- llvm/tools/llvm-size/Opts.td | 3
-rw-r--r-- llvm/tools/llvm-size/llvm-size.cpp | 6
-rw-r--r-- llvm/unittests/ADT/EquivalenceClassesTest.cpp | 23
-rw-r--r-- llvm/unittests/ADT/PackedVectorTest.cpp | 8
-rw-r--r-- llvm/unittests/CAS/CMakeLists.txt | 5
-rw-r--r-- llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp | 220
-rw-r--r-- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 2
-rw-r--r-- llvm/unittests/CodeGen/LexicalScopesTest.cpp | 43
-rw-r--r-- llvm/unittests/CodeGen/MFCommon.inc | 7
-rw-r--r-- llvm/unittests/CodeGen/TypeTraitsTest.cpp | 35
-rw-r--r-- llvm/unittests/MC/StringTableBuilderTest.cpp | 4
-rw-r--r-- llvm/unittests/Support/MustacheTest.cpp | 138
-rw-r--r-- llvm/unittests/Support/SipHashTest.cpp | 7
-rw-r--r-- llvm/unittests/Support/VirtualFileSystemTest.cpp | 32
-rw-r--r-- llvm/utils/profcheck-xfail.txt | 448
-rw-r--r-- mlir/docs/Tutorials/Toy/Ch-6.md | 2
-rw-r--r-- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 31
-rw-r--r-- mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td | 24
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td | 3
-rw-r--r-- mlir/include/mlir/TableGen/Class.h | 4
-rw-r--r-- mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp | 18
-rw-r--r-- mlir/lib/Bindings/Python/Rewrite.cpp | 8
-rw-r--r-- mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp | 2
-rw-r--r-- mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp | 2
-rw-r--r-- mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp | 8
-rw-r--r-- mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp | 6
-rw-r--r-- mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp | 10
-rw-r--r-- mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2
-rw-r--r-- mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp | 2
-rw-r--r-- mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp | 4
-rw-r--r-- mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp | 2
-rw-r--r-- mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp | 9
-rw-r--r-- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 10
-rw-r--r-- mlir/lib/ExecutionEngine/JitRunner.cpp | 2
-rw-r--r-- mlir/lib/Transforms/Mem2Reg.cpp | 2
-rw-r--r-- mlir/test/Dialect/ControlFlow/canonicalize.mlir | 144
-rw-r--r-- mlir/test/Dialect/GPU/memref-to-llvm.mlir | 33
-rw-r--r-- mlir/test/Dialect/LLVMIR/rocdl.mlir | 14
-rw-r--r-- mlir/test/Target/LLVMIR/rocdl.mlir | 14
-rw-r--r-- mlir/test/mlir-tblgen/attr-duplicated-builder-error.td | 48
-rw-r--r-- mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td | 52
-rw-r--r-- mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 65
-rw-r--r-- mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 4
-rw-r--r-- offload/DeviceRTL/CMakeLists.txt | 188
-rw-r--r-- offload/liboffload/src/OffloadImpl.cpp | 89
-rw-r--r-- offload/libomptarget/omptarget.cpp | 371
-rw-r--r-- offload/test/mapping/lambda_by_value.cpp | 3
-rw-r--r-- offload/test/mapping/map_back_race.cpp | 3
-rw-r--r-- offload/test/mapping/map_both_pointer_pointee.c | 9
-rw-r--r-- offload/test/mapping/map_ptr_and_star_local.c | 5
-rw-r--r-- offload/test/mapping/map_structptr_and_member_global.c | 5
-rw-r--r-- offload/test/mapping/map_structptr_and_member_local.c | 5
-rw-r--r-- offload/test/offloading/CUDA/basic_launch_multi_arg.cu | 8
-rw-r--r-- offload/test/offloading/bug51781.c | 5
-rw-r--r-- offload/test/offloading/fortran/declare-target-automap.f90 | 3
-rw-r--r-- offload/test/offloading/fortran/target-no-loop.f90 | 1
-rw-r--r-- offload/test/offloading/interop.c | 3
-rw-r--r-- offload/test/offloading/single_threaded_for_barrier_hang_1.c | 3
-rw-r--r-- offload/test/offloading/single_threaded_for_barrier_hang_2.c | 5
-rw-r--r-- offload/test/offloading/spmdization.c | 3
-rw-r--r-- offload/test/sanitizer/ptr_outside_alloc_1.c | 10
-rw-r--r-- offload/test/sanitizer/ptr_outside_alloc_2.c | 10
-rw-r--r-- offload/test/sanitizer/use_after_free_1.c | 10
-rw-r--r-- offload/test/sanitizer/use_after_free_2.c | 10
-rw-r--r-- offload/tools/deviceinfo/llvm-offload-device-info.cpp | 2
-rw-r--r-- openmp/device/CMakeLists.txt | 2
-rw-r--r-- openmp/runtime/test/transform/fuse/foreach.cpp | 191
-rw-r--r-- openmp/runtime/test/transform/fuse/intfor.c | 50
-rw-r--r-- openmp/runtime/test/transform/fuse/iterfor.cpp | 194
-rw-r--r-- openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp | 207
-rw-r--r-- openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c | 45
-rw-r--r-- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 18
-rw-r--r-- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 9
536 files changed, 26143 insertions, 5193 deletions
diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml
index 167e7cf..14c349b 100644
--- a/.github/workflows/build-ci-container-windows.yml
+++ b/.github/workflows/build-ci-container-windows.yml
@@ -44,7 +44,7 @@ jobs:
run: |
docker save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
- name: Upload container image
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: container
path: ${{ steps.vars.outputs.container-filename }}
diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
index 67f35fd..01f1b8d 100644
--- a/.github/workflows/build-ci-container.yml
+++ b/.github/workflows/build-ci-container.yml
@@ -64,7 +64,7 @@ jobs:
podman save ${{ steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-agent-filename }}
- name: Upload container image
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: container-${{ matrix.arch }}
path: "*.tar"
diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml
index cadcaa9..69b5715 100644
--- a/.github/workflows/build-metrics-container.yml
+++ b/.github/workflows/build-metrics-container.yml
@@ -49,7 +49,7 @@ jobs:
run: |
podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
- name: Upload Container Image
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: container
path: ${{ steps.vars.outputs.container-filename }}
diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml
index 7e8c156..f18a69c 100644
--- a/.github/workflows/check-ci.yml
+++ b/.github/workflows/check-ci.yml
@@ -26,7 +26,7 @@ jobs:
with:
sparse-checkout: .ci
- name: Setup Python
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: 3.13
cache: 'pip'
diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml
index 7d37b90..49cf410 100644
--- a/.github/workflows/ci-post-commit-analyzer.yml
+++ b/.github/workflows/ci-post-commit-analyzer.yml
@@ -44,7 +44,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
+ uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
with:
# A full build of llvm, clang, lld, and lldb takes about 250MB
# of ccache space. There's not much reason to have more than this,
@@ -87,7 +87,7 @@ jobs:
scan-build --generate-index-only build/analyzer-results
- name: Upload Results
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always()
with:
name: analyzer-results
diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml
index a7be81b..734dc21 100644
--- a/.github/workflows/commit-access-review.yml
+++ b/.github/workflows/commit-access-review.yml
@@ -28,7 +28,7 @@ jobs:
python3 .github/workflows/commit-access-review.py $GITHUB_TOKEN
- name: Upload Triage List
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: triagers
path: triagers.log
diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile
index 1d3f5f9..dc0c9ca 100644
--- a/.github/workflows/containers/github-action-ci/Dockerfile
+++ b/.github/workflows/containers/github-action-ci/Dockerfile
@@ -62,6 +62,7 @@ RUN apt-get update && \
# Having a symlink from python to python3 enables code sharing between
# the Linux and Windows pipelines.
python3-pip \
+ python3-venv \
file \
tzdata \
python-is-python3 && \
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 8cdd39c..b5f3413 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -95,9 +95,9 @@ jobs:
workflow:
- '.github/workflows/docs.yml'
- name: Setup Python env
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
- python-version: '3.11'
+ python-version: '3.13'
cache: 'pip'
cache-dependency-path: 'llvm/docs/requirements-hashed.txt'
- name: Install python dependencies
@@ -209,7 +209,7 @@ jobs:
mkdir built-docs/flang
cp -r flang-build/docs/* built-docs/flang/
- name: Upload docs
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: docs-output
path: built-docs/
diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
index 9390fba..981c6fa 100644
--- a/.github/workflows/email-check.yaml
+++ b/.github/workflows/email-check.yaml
@@ -39,7 +39,7 @@ jobs:
[{"body" : "$COMMENT"}]
EOF
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml
index efb8143..63388eb 100644
--- a/.github/workflows/gha-codeql.yml
+++ b/.github/workflows/gha-codeql.yml
@@ -29,9 +29,9 @@ jobs:
sparse-checkout: |
.github/
- name: Initialize CodeQL
- uses: github/codeql-action/init@192325c86100d080feab897ff886c34abd4c83a3 # v3.30.3
+ uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4
with:
languages: actions
queries: security-extended
- name: Perform CodeQL Analysis
- uses: github/codeql-action/analyze@192325c86100d080feab897ff886c34abd4c83a3 # v3.30.3
+ uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4
diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml
index 72cbbe2..dcb8523 100644
--- a/.github/workflows/hlsl-test-all.yaml
+++ b/.github/workflows/hlsl-test-all.yaml
@@ -80,7 +80,7 @@ jobs:
ninja check-hlsl-unit
ninja ${{ inputs.TestTarget }}
- name: Publish Test Results
- uses: EnricoMi/publish-unit-test-result-action/macos@170bf24d20d201b842d7a52403b73ed297e6645b # v2
+ uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0
if: always() && runner.os == 'macOS'
with:
comment_mode: off
diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml
index db9389b..26cd60c 100644
--- a/.github/workflows/issue-write.yml
+++ b/.github/workflows/issue-write.yml
@@ -40,7 +40,7 @@ jobs:
- name: 'Comment on PR'
if: steps.download-artifact.outputs.artifact-id != ''
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml
index 8967cd0..3a048ae 100644
--- a/.github/workflows/libc-fullbuild-tests.yml
+++ b/.github/workflows/libc-fullbuild-tests.yml
@@ -61,7 +61,7 @@ jobs:
# Do not use direct GHAC access even though it is supported by sccache. GHAC rejects
# frequent small object writes.
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
+ uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
with:
max-size: 1G
key: libc_fullbuild_${{ matrix.c_compiler }}
diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml
index 7154946..df9a20d 100644
--- a/.github/workflows/libc-overlay-tests.yml
+++ b/.github/workflows/libc-overlay-tests.yml
@@ -51,7 +51,7 @@ jobs:
# Do not use direct GHAC access even though it is supported by sccache. GHAC rejects
# frequent small object writes.
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
+ uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
with:
max-size: 1G
key: libc_overlay_build_${{ matrix.os }}_${{ matrix.compiler.c_compiler }}
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index d53a2f3..5ccf976 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -131,7 +131,7 @@ jobs:
sed -i 's/LLVM_[0-9]\+/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi
done
- name: Upload ABI file
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
with:
name: ${{ matrix.name }}
path: '*${{ matrix.ref }}.abi'
@@ -165,7 +165,7 @@ jobs:
done
- name: Upload ABI Comparison
if: always()
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
with:
name: compat-report-${{ github.sha }}
path: compat_reports/
diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml
index e168928..8fb8cec 100644
--- a/.github/workflows/libclang-python-tests.yml
+++ b/.github/workflows/libclang-python-tests.yml
@@ -34,11 +34,11 @@ jobs:
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup Python
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: ${{ matrix.python-version }}
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
+ uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
with:
max-size: 2G
key: spirv-ubuntu-24.04
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 2e6ff7f..5fe2ffb 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -60,7 +60,7 @@ jobs:
env:
CC: ${{ matrix.cc }}
CXX: ${{ matrix.cxx }}
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
+ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always()
with:
name: ${{ matrix.config }}-${{ matrix.cxx }}-results
@@ -105,7 +105,7 @@ jobs:
env:
CC: ${{ matrix.cc }}
CXX: ${{ matrix.cxx }}
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
+ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always() # Upload artifacts even if the build or test suite fails
with:
name: ${{ matrix.config }}-${{ matrix.cxx }}-results
@@ -169,7 +169,7 @@ jobs:
env:
CC: clang-22
CXX: clang++-22
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
+ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always()
with:
name: ${{ matrix.config }}-results
@@ -223,7 +223,7 @@ jobs:
source .venv/bin/activate
python -m pip install psutil
bash libcxx/utils/ci/run-buildbot ${{ matrix.config }}
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
+ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always() # Upload artifacts even if the build or test suite fails
with:
name: macos-${{ matrix.config }}-results
diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml
index cbaa8e0..312cb47 100644
--- a/.github/workflows/libcxx-build-containers.yml
+++ b/.github/workflows/libcxx-build-containers.yml
@@ -55,7 +55,7 @@ jobs:
TAG: ${{ github.sha }}
- name: Log in to GitHub Container Registry
- uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
+ uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
with:
registry: ghcr.io
username: ${{ github.actor }}
diff --git a/.github/workflows/libcxx-check-generated-files.yml b/.github/workflows/libcxx-check-generated-files.yml
index f338bd6..d34b6a7 100644
--- a/.github/workflows/libcxx-check-generated-files.yml
+++ b/.github/workflows/libcxx-check-generated-files.yml
@@ -15,7 +15,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
- uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1
+ uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1
with:
clangformat: 17.0.1
ninja: true
diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml
index 17a97df..0379a0a 100644
--- a/.github/workflows/libcxx-run-benchmarks.yml
+++ b/.github/workflows/libcxx-run-benchmarks.yml
@@ -35,7 +35,7 @@ jobs:
steps:
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
- python-version: '3.10'
+ python-version: '3.13'
- name: Extract information from the PR
id: vars
diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml
index 5470662..7d42abf 100644
--- a/.github/workflows/llvm-bugs.yml
+++ b/.github/workflows/llvm-bugs.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-24.04
if: github.repository == 'llvm/llvm-project'
steps:
- - uses: actions/setup-node@1d0ff469b7ec7b3cb9d8673fde0c81c44821de2a # v4.2.0
+ - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
with:
node-version: 18
check-latest: true
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index ea80e22..c4701c7 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -128,14 +128,14 @@ jobs:
# Remove symbol versioning from dumps, so we can compare across major versions.
sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi
- name: Upload ABI file
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
with:
name: ${{ matrix.name }}
path: ${{ matrix.ref }}.abi
- name: Upload symbol list file
if: matrix.name == 'build-baseline'
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
with:
name: symbol-list
path: llvm.symbols
@@ -179,7 +179,7 @@ jobs:
abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c"
- name: Upload ABI Comparison
if: always()
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
with:
name: compat-report-${{ github.sha }}
path: compat_reports/
diff --git a/.github/workflows/mlir-spirv-tests.yml b/.github/workflows/mlir-spirv-tests.yml
index 78952cc..5bb16c7 100644
--- a/.github/workflows/mlir-spirv-tests.yml
+++ b/.github/workflows/mlir-spirv-tests.yml
@@ -30,7 +30,7 @@ jobs:
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
+ uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
with:
max-size: 2G
key: spirv-mlir-ubuntu-24.04
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index 61c8680..1e0dc70 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -43,14 +43,14 @@ jobs:
# of a release cycle (x.1.0) or the last version of a release cycle, or
# if there have been relevant clang-format backports.
- name: Install clang-format
- uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1
+ uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1
with:
clangformat: 21.1.0
- name: Setup Python env
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
- python-version: '3.11'
+ python-version: '3.13'
cache: 'pip'
cache-dependency-path: 'llvm/utils/git/requirements_formatting.txt'
@@ -72,7 +72,7 @@ jobs:
--end-rev HEAD \
--changed-files "$CHANGED_FILES"
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml
index daefc9b..776ec4a 100644
--- a/.github/workflows/pr-code-lint.yml
+++ b/.github/workflows/pr-code-lint.yml
@@ -27,7 +27,7 @@ jobs:
cancel-in-progress: true
steps:
- name: Fetch LLVM sources
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
with:
fetch-depth: 2
@@ -51,14 +51,14 @@ jobs:
# of a release cycle (x.1.0) or the last version of a release cycle, or
# if there have been relevant clang-format backports.
- name: Install clang-tidy
- uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1
+ uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1
with:
clang-tidy: 21.1.0
- name: Setup Python env
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
- python-version: '3.12'
+ python-version: '3.13'
- name: Install Python dependencies
run: python3 -m pip install -r llvm/utils/git/requirements_linting.txt
@@ -107,7 +107,7 @@ jobs:
--changed-files "$CHANGED_FILES"
- name: Upload results
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml
index f0197d7..8162a89 100644
--- a/.github/workflows/pr-request-release-note.yml
+++ b/.github/workflows/pr-request-release-note.yml
@@ -41,7 +41,7 @@ jobs:
request-release-note \
--pr-number ${{ github.event.pull_request.number}}
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 63ab4a8..a9c107e 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -76,7 +76,7 @@ jobs:
# https://github.com/actions/upload-artifact/issues/569
continue-on-error: true
if: '!cancelled()'
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: Premerge Artifacts (Linux)
path: artifacts/
@@ -130,7 +130,7 @@ jobs:
# https://github.com/actions/upload-artifact/issues/569
continue-on-error: true
if: '!cancelled()'
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: Premerge Artifacts (Windows)
path: artifacts/
@@ -151,7 +151,7 @@ jobs:
with:
fetch-depth: 2
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
+ uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
with:
max-size: "2000M"
- name: Install Ninja
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
index 6546540..8b24948 100644
--- a/.github/workflows/release-asset-audit.yml
+++ b/.github/workflows/release-asset-audit.yml
@@ -38,7 +38,7 @@ jobs:
if: >-
github.event_name != 'pull_request' &&
failure()
- uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+ uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0
with:
github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
script: |
diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml
index f08088c..84ccf98 100644
--- a/.github/workflows/release-binaries-save-stage/action.yml
+++ b/.github/workflows/release-binaries-save-stage/action.yml
@@ -30,14 +30,14 @@ runs:
tar -C ${{ inputs.build-prefix }} -c build/ | zstd -T0 -c > build.tar.zst
- name: Upload Stage 1 Source
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: ${{ runner.os }}-${{ runner.arch }}-${{ github.job }}-source
path: llvm-project.tar.zst
retention-days: 2
- name: Upload Stage 1 Build Dir
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: ${{ runner.os}}-${{ runner.arch }}-${{ github.job }}-build
path: build.tar.zst
diff --git a/.github/workflows/release-binaries-setup-stage/action.yml b/.github/workflows/release-binaries-setup-stage/action.yml
index 8f45e22..475a25f 100644
--- a/.github/workflows/release-binaries-setup-stage/action.yml
+++ b/.github/workflows/release-binaries-setup-stage/action.yml
@@ -22,7 +22,7 @@ runs:
using: "composite"
steps:
- name: Install Ninja
- uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
+ uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main
- name: Setup Windows
if: startsWith(runner.os, 'Windows')
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 8f422a0..cba48e4 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -68,9 +68,9 @@ jobs:
steps:
# It's good practice to use setup-python, but this is also required on macos-14
# due to https://github.com/actions/runner-images/issues/10385
- - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f
+ - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
- python-version: '3.12'
+ python-version: '3.13'
- name: Checkout LLVM
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
@@ -250,7 +250,7 @@ jobs:
release_dir=`find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname 'stage2-bins'`
mv $release_dir/${{ needs.prepare.outputs.release-binary-filename }} .
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
+ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: ${{ runner.os }}-${{ runner.arch }}-release-binary
# Due to path differences on Windows when running in bash vs running on node,
@@ -301,7 +301,7 @@ jobs:
- name: Attest Build Provenance
id: provenance
- uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0
+ uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4
with:
subject-path: ${{ needs.prepare.outputs.release-binary-filename }}
@@ -310,7 +310,7 @@ jobs:
mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
- name: Upload Build Provenance
- uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: ${{ needs.prepare.outputs.release-binary-filename }}-attestation
path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml
index 712ff18..d3d375d 100644
--- a/.github/workflows/release-documentation.yml
+++ b/.github/workflows/release-documentation.yml
@@ -37,7 +37,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup Python env
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
cache: 'pip'
cache-dependency-path: './llvm/docs/requirements.txt'
@@ -59,7 +59,7 @@ jobs:
./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-doxygen
- name: Create Release Notes Artifact
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
with:
name: release-notes
path: docs-build/html-export/
diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml
index 17c6774..79e509e 100644
--- a/.github/workflows/release-doxygen.yml
+++ b/.github/workflows/release-doxygen.yml
@@ -43,7 +43,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup Python env
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
cache: 'pip'
cache-dependency-path: './llvm/docs/requirements.txt'
diff --git a/.github/workflows/release-lit.yml b/.github/workflows/release-lit.yml
index 60ec644..8b1ce04 100644
--- a/.github/workflows/release-lit.yml
+++ b/.github/workflows/release-lit.yml
@@ -45,7 +45,7 @@ jobs:
./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions
- name: Setup Cpp
- uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1
+ uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1
with:
compiler: llvm-16.0.6
cmake: true
@@ -66,14 +66,14 @@ jobs:
python3 setup.py sdist bdist_wheel
- name: Upload lit to test.pypi.org
- uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
+ uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
with:
password: ${{ secrets.LLVM_LIT_TEST_PYPI_API_TOKEN }}
repository-url: https://test.pypi.org/legacy/
packages-dir: llvm/utils/lit/dist/
- name: Upload lit to pypi.org
- uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
+ uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
with:
password: ${{ secrets.LLVM_LIT_PYPI_API_TOKEN }}
packages-dir: llvm/utils/lit/dist/
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 14cc4c4..2278b96 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -92,14 +92,14 @@ jobs:
- name: Attest Build Provenance
if: github.event_name != 'pull_request'
id: provenance
- uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0
+ uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4
with:
subject-path: "*.xz"
- if: github.event_name != 'pull_request'
run: |
mv ${{ steps.provenance.outputs.bundle-path }} .
- name: Create Tarball Artifacts
- uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
path: |
*.xz
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index 40db550..c07df33 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -36,7 +36,7 @@ jobs:
persist-credentials: false
- name: "Run analysis"
- uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1
+ uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2
with:
results_file: results.sarif
results_format: sarif
@@ -49,7 +49,7 @@ jobs:
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
# format to the repository Actions tab.
- name: "Upload artifact"
- uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: SARIF file
path: results.sarif
@@ -57,6 +57,6 @@ jobs:
# Upload the results to GitHub's code scanning dashboard.
- name: "Upload to code-scanning"
- uses: github/codeql-action/upload-sarif@80f993039571a6de66594ecaa432875a6942e8e0 # v2.20.6
+ uses: github/codeql-action/upload-sarif@b8d3b6e8af63cde30bdc382c0bc28114f4346c88 # v2.28.1
with:
sarif_file: results.sarif
diff --git a/.github/workflows/spirv-tests.yml b/.github/workflows/spirv-tests.yml
index 8708fb0..69374ae 100644
--- a/.github/workflows/spirv-tests.yml
+++ b/.github/workflows/spirv-tests.yml
@@ -26,7 +26,7 @@ jobs:
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup ccache
- uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
+ uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
with:
max-size: 2G
key: spirv-ubuntu-24.04
diff --git a/.github/workflows/unprivileged-download-artifact/action.yml b/.github/workflows/unprivileged-download-artifact/action.yml
index 9d8fb59..5b50d7c 100644
--- a/.github/workflows/unprivileged-download-artifact/action.yml
+++ b/.github/workflows/unprivileged-download-artifact/action.yml
@@ -27,7 +27,7 @@ outputs:
runs:
using: "composite"
steps:
- - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
+ - uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0
id: artifact-url
with:
script: |
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 6752489..5c89a42 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -504,9 +504,7 @@ static void emitDWOBuilder(const std::string &DWOName,
}
emitUnit(DWODIEBuilder, *Streamer, SplitCU);
} else {
- for (std::unique_ptr<llvm::DWARFUnit> &CU :
- SplitCU.getContext().dwo_compile_units())
- emitUnit(DWODIEBuilder, *Streamer, *CU);
+ emitUnit(DWODIEBuilder, *Streamer, SplitCU);
// emit debug_types sections for dwarf4
for (DWARFUnit *CU : DWODIEBuilder.getDWARF4TUVector())
diff --git a/bolt/test/AArch64/dwarf4-dwp-aarch64.s b/bolt/test/AArch64/dwarf4-dwp-aarch64.s
new file mode 100755
index 0000000..37507e1
--- /dev/null
+++ b/bolt/test/AArch64/dwarf4-dwp-aarch64.s
@@ -0,0 +1,407 @@
+## This test checks updating debug info via a DWARF4 DWP file
+# RUN: rm -rf %t && mkdir -p %t && cd %t
+# RUN: split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown --split-dwarf-file=main.exe-main.dwo %t/main.s -o %t/main.o
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown --split-dwarf-file=main.exe-callee.dwo %t/callee.s -o %t/callee.o
+# RUN: %clangxx %cxxflags -gdwarf-4 -gsplit-dwarf=split -Wl,-e,main %t/main.o %t/callee.o -o main.exe
+# RUN: llvm-dwp -e %t/main.exe -o %t/main.exe.dwp
+# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections 2>&1 | FileCheck %s
+
+# CHECK-NOT: Assertion
+
+#--- main.s
+ .file "main.cpp"
+ .globl main // -- Begin function main
+ .type main,@function
+main: // @main
+.Lfunc_begin0:
+ .file 1 "." "main.cpp"
+ .loc 1 2 0 // main.cpp:2:0
+ .loc 1 2 21 prologue_end // main.cpp:2:21
+ .loc 1 2 14 epilogue_begin is_stmt 0 // main.cpp:2:14
+ ret
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .section .debug_abbrev,"",@progbits
+ .byte 1 // Abbreviation Code
+ .byte 17 // DW_TAG_compile_unit
+ .byte 0 // DW_CHILDREN_no
+ .byte 16 // DW_AT_stmt_list
+ .byte 23 // DW_FORM_sec_offset
+ .byte 27 // DW_AT_comp_dir
+ .byte 14 // DW_FORM_strp
+ .ascii "\264B" // DW_AT_GNU_pubnames
+ .byte 25 // DW_FORM_flag_present
+ .ascii "\260B" // DW_AT_GNU_dwo_name
+ .byte 14 // DW_FORM_strp
+ .ascii "\261B" // DW_AT_GNU_dwo_id
+ .byte 7 // DW_FORM_data8
+ .byte 17 // DW_AT_low_pc
+ .byte 1 // DW_FORM_addr
+ .byte 18 // DW_AT_high_pc
+ .byte 6 // DW_FORM_data4
+ .ascii "\263B" // DW_AT_GNU_addr_base
+ .byte 23 // DW_FORM_sec_offset
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 0 // EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .word .Ldebug_info_end0-.Ldebug_info_start0 // Length of Unit
+.Ldebug_info_start0:
+ .hword 4 // DWARF version number
+ .word .debug_abbrev // Offset Into Abbrev. Section
+ .byte 8 // Address Size (in bytes)
+ .byte 1 // Abbrev [1] 0xb:0x25 DW_TAG_compile_unit
+ .word .Lline_table_start0 // DW_AT_stmt_list
+ .word .Lskel_string0 // DW_AT_comp_dir
+ // DW_AT_GNU_pubnames
+ .word .Lskel_string1 // DW_AT_GNU_dwo_name
+ .xword 1465063543908291764 // DW_AT_GNU_dwo_id
+ .xword .Lfunc_begin0 // DW_AT_low_pc
+ .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc
+ .word .Laddr_table_base0 // DW_AT_GNU_addr_base
+.Ldebug_info_end0:
+ .section .debug_str,"MS",@progbits,1
+.Lskel_string0:
+ .asciz "." // string offset=0
+.Lskel_string1:
+ .asciz "main.exe-main.dwo" // string offset=2
+ .section .debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+ .asciz "main" // string offset=0
+.Linfo_string1:
+ .asciz "int" // string offset=5
+.Linfo_string2:
+ .byte 0 // string offset=9
+.Linfo_string3:
+ .asciz "main.cpp" // string offset=10
+.Linfo_string4:
+ .asciz "main.exe-main.dwo" // string offset=19
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .word 0
+ .word 5
+ .word 9
+ .word 10
+ .word 19
+ .section .debug_info.dwo,"e",@progbits
+ .word .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 // Length of Unit
+.Ldebug_info_dwo_start0:
+ .hword 4 // DWARF version number
+ .word 0 // Offset Into Abbrev. Section
+ .byte 8 // Address Size (in bytes)
+ .byte 1 // Abbrev [1] 0xb:0x22 DW_TAG_compile_unit
+ .byte 2 // DW_AT_producer
+ .hword 33 // DW_AT_language
+ .byte 3 // DW_AT_name
+ .byte 4 // DW_AT_GNU_dwo_name
+ .xword 1465063543908291764 // DW_AT_GNU_dwo_id
+ .byte 2 // Abbrev [2] 0x19:0xf DW_TAG_subprogram
+ .byte 0 // DW_AT_low_pc
+ .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc
+ .byte 1 // DW_AT_frame_base
+ .byte 109
+ .byte 0 // DW_AT_name
+ .byte 1 // DW_AT_decl_file
+ .byte 2 // DW_AT_decl_line
+ .word 40 // DW_AT_type
+ // DW_AT_external
+ .byte 3 // Abbrev [3] 0x28:0x4 DW_TAG_base_type
+ .byte 1 // DW_AT_name
+ .byte 5 // DW_AT_encoding
+ .byte 4 // DW_AT_byte_size
+ .byte 0 // End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .section .debug_abbrev.dwo,"e",@progbits
+ .byte 1 // Abbreviation Code
+ .byte 17 // DW_TAG_compile_unit
+ .byte 1 // DW_CHILDREN_yes
+ .byte 37 // DW_AT_producer
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .byte 19 // DW_AT_language
+ .byte 5 // DW_FORM_data2
+ .byte 3 // DW_AT_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .ascii "\260B" // DW_AT_GNU_dwo_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .ascii "\261B" // DW_AT_GNU_dwo_id
+ .byte 7 // DW_FORM_data8
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 2 // Abbreviation Code
+ .byte 46 // DW_TAG_subprogram
+ .byte 0 // DW_CHILDREN_no
+ .byte 17 // DW_AT_low_pc
+ .ascii "\201>" // DW_FORM_GNU_addr_index
+ .byte 18 // DW_AT_high_pc
+ .byte 6 // DW_FORM_data4
+ .byte 64 // DW_AT_frame_base
+ .byte 24 // DW_FORM_exprloc
+ .byte 3 // DW_AT_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .byte 58 // DW_AT_decl_file
+ .byte 11 // DW_FORM_data1
+ .byte 59 // DW_AT_decl_line
+ .byte 11 // DW_FORM_data1
+ .byte 73 // DW_AT_type
+ .byte 19 // DW_FORM_ref4
+ .byte 63 // DW_AT_external
+ .byte 25 // DW_FORM_flag_present
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 3 // Abbreviation Code
+ .byte 36 // DW_TAG_base_type
+ .byte 0 // DW_CHILDREN_no
+ .byte 3 // DW_AT_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .byte 62 // DW_AT_encoding
+ .byte 11 // DW_FORM_data1
+ .byte 11 // DW_AT_byte_size
+ .byte 11 // DW_FORM_data1
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 0 // EOM(3)
+ .section .debug_addr,"",@progbits
+.Laddr_table_base0:
+ .xword .Lfunc_begin0
+ .section .debug_gnu_pubnames,"",@progbits
+ .word .LpubNames_end0-.LpubNames_start0 // Length of Public Names Info
+.LpubNames_start0:
+ .hword 2 // DWARF Version
+ .word .Lcu_begin0 // Offset of Compilation Unit Info
+ .word 48 // Compilation Unit Length
+ .word 25 // DIE offset
+ .byte 48 // Attributes: FUNCTION, EXTERNAL
+ .asciz "main" // External Name
+ .word 0 // End Mark
+.LpubNames_end0:
+ .section .debug_gnu_pubtypes,"",@progbits
+ .word .LpubTypes_end0-.LpubTypes_start0 // Length of Public Types Info
+.LpubTypes_start0:
+ .hword 2 // DWARF Version
+ .word .Lcu_begin0 // Offset of Compilation Unit Info
+ .word 48 // Compilation Unit Length
+ .word 40 // DIE offset
+ .byte 144 // Attributes: TYPE, STATIC
+ .asciz "int" // External Name
+ .word 0 // End Mark
+.LpubTypes_end0:
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .addrsig_sym _Z6calleei
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
+#--- callee.s
+ .file "callee.cpp"
+ .globl _Z6calleei // -- Begin function _Z6calleei
+ .type _Z6calleei,@function
+_Z6calleei: // @_Z6calleei
+.Lfunc_begin0:
+ .file 1 "." "callee.cpp"
+ .loc 1 1 0 // callee.cpp:1:0
+ .loc 1 1 28 prologue_end // callee.cpp:1:28
+ .loc 1 1 21 epilogue_begin is_stmt 0 // callee.cpp:1:21
+ ret
+.Lfunc_end0:
+ .size _Z6calleei, .Lfunc_end0-_Z6calleei
+ .section .debug_abbrev,"",@progbits
+ .byte 1 // Abbreviation Code
+ .byte 17 // DW_TAG_compile_unit
+ .byte 0 // DW_CHILDREN_no
+ .byte 16 // DW_AT_stmt_list
+ .byte 23 // DW_FORM_sec_offset
+ .byte 27 // DW_AT_comp_dir
+ .byte 14 // DW_FORM_strp
+ .ascii "\264B" // DW_AT_GNU_pubnames
+ .byte 25 // DW_FORM_flag_present
+ .ascii "\260B" // DW_AT_GNU_dwo_name
+ .byte 14 // DW_FORM_strp
+ .ascii "\261B" // DW_AT_GNU_dwo_id
+ .byte 7 // DW_FORM_data8
+ .byte 17 // DW_AT_low_pc
+ .byte 1 // DW_FORM_addr
+ .byte 18 // DW_AT_high_pc
+ .byte 6 // DW_FORM_data4
+ .ascii "\263B" // DW_AT_GNU_addr_base
+ .byte 23 // DW_FORM_sec_offset
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 0 // EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .word .Ldebug_info_end0-.Ldebug_info_start0 // Length of Unit
+.Ldebug_info_start0:
+ .hword 4 // DWARF version number
+ .word .debug_abbrev // Offset Into Abbrev. Section
+ .byte 8 // Address Size (in bytes)
+ .byte 1 // Abbrev [1] 0xb:0x25 DW_TAG_compile_unit
+ .word .Lline_table_start0 // DW_AT_stmt_list
+ .word .Lskel_string0 // DW_AT_comp_dir
+ // DW_AT_GNU_pubnames
+ .word .Lskel_string1 // DW_AT_GNU_dwo_name
+ .xword 7650227797527095061 // DW_AT_GNU_dwo_id
+ .xword .Lfunc_begin0 // DW_AT_low_pc
+ .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc
+ .word .Laddr_table_base0 // DW_AT_GNU_addr_base
+.Ldebug_info_end0:
+ .section .debug_str,"MS",@progbits,1
+.Lskel_string0:
+ .asciz "." // string offset=0
+.Lskel_string1:
+ .asciz "main.exe-callee.dwo" // string offset=2
+ .section .debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+ .asciz "_Z6calleei" // string offset=0
+.Linfo_string1:
+ .asciz "callee" // string offset=11
+.Linfo_string2:
+ .asciz "int" // string offset=18
+.Linfo_string3:
+ .asciz "x" // string offset=22
+.Linfo_string4:
+ .byte 0 // string offset=24
+.Linfo_string5:
+ .asciz "callee.cpp" // string offset=25
+.Linfo_string6:
+ .asciz "main.exe-callee.dwo" // string offset=36
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .word 0
+ .word 11
+ .word 18
+ .word 22
+ .word 24
+ .word 25
+ .word 36
+ .section .debug_info.dwo,"e",@progbits
+ .word .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 // Length of Unit
+.Ldebug_info_dwo_start0:
+ .hword 4 // DWARF version number
+ .word 0 // Offset Into Abbrev. Section
+ .byte 8 // Address Size (in bytes)
+ .byte 1 // Abbrev [1] 0xb:0x2f DW_TAG_compile_unit
+ .byte 4 // DW_AT_producer
+ .hword 33 // DW_AT_language
+ .byte 5 // DW_AT_name
+ .byte 6 // DW_AT_GNU_dwo_name
+ .xword 7650227797527095061 // DW_AT_GNU_dwo_id
+ .byte 2 // Abbrev [2] 0x19:0x1c DW_TAG_subprogram
+ .byte 0 // DW_AT_low_pc
+ .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc
+ .byte 1 // DW_AT_frame_base
+ .byte 111
+ .byte 0 // DW_AT_linkage_name
+ .byte 1 // DW_AT_name
+ .byte 1 // DW_AT_decl_file
+ .byte 1 // DW_AT_decl_line
+ .word 53 // DW_AT_type
+ // DW_AT_external
+ .byte 3 // Abbrev [3] 0x29:0xb DW_TAG_formal_parameter
+ .byte 2 // DW_AT_location
+ .byte 145
+ .byte 12
+ .byte 3 // DW_AT_name
+ .byte 1 // DW_AT_decl_file
+ .byte 1 // DW_AT_decl_line
+ .word 53 // DW_AT_type
+ .byte 0 // End Of Children Mark
+ .byte 4 // Abbrev [4] 0x35:0x4 DW_TAG_base_type
+ .byte 2 // DW_AT_name
+ .byte 5 // DW_AT_encoding
+ .byte 4 // DW_AT_byte_size
+ .byte 0 // End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .section .debug_abbrev.dwo,"e",@progbits
+ .byte 1 // Abbreviation Code
+ .byte 17 // DW_TAG_compile_unit
+ .byte 1 // DW_CHILDREN_yes
+ .byte 37 // DW_AT_producer
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .byte 19 // DW_AT_language
+ .byte 5 // DW_FORM_data2
+ .byte 3 // DW_AT_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .ascii "\260B" // DW_AT_GNU_dwo_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .ascii "\261B" // DW_AT_GNU_dwo_id
+ .byte 7 // DW_FORM_data8
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 2 // Abbreviation Code
+ .byte 46 // DW_TAG_subprogram
+ .byte 1 // DW_CHILDREN_yes
+ .byte 17 // DW_AT_low_pc
+ .ascii "\201>" // DW_FORM_GNU_addr_index
+ .byte 18 // DW_AT_high_pc
+ .byte 6 // DW_FORM_data4
+ .byte 64 // DW_AT_frame_base
+ .byte 24 // DW_FORM_exprloc
+ .byte 110 // DW_AT_linkage_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .byte 3 // DW_AT_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .byte 58 // DW_AT_decl_file
+ .byte 11 // DW_FORM_data1
+ .byte 59 // DW_AT_decl_line
+ .byte 11 // DW_FORM_data1
+ .byte 73 // DW_AT_type
+ .byte 19 // DW_FORM_ref4
+ .byte 63 // DW_AT_external
+ .byte 25 // DW_FORM_flag_present
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 3 // Abbreviation Code
+ .byte 5 // DW_TAG_formal_parameter
+ .byte 0 // DW_CHILDREN_no
+ .byte 2 // DW_AT_location
+ .byte 24 // DW_FORM_exprloc
+ .byte 3 // DW_AT_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .byte 58 // DW_AT_decl_file
+ .byte 11 // DW_FORM_data1
+ .byte 59 // DW_AT_decl_line
+ .byte 11 // DW_FORM_data1
+ .byte 73 // DW_AT_type
+ .byte 19 // DW_FORM_ref4
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 4 // Abbreviation Code
+ .byte 36 // DW_TAG_base_type
+ .byte 0 // DW_CHILDREN_no
+ .byte 3 // DW_AT_name
+ .ascii "\202>" // DW_FORM_GNU_str_index
+ .byte 62 // DW_AT_encoding
+ .byte 11 // DW_FORM_data1
+ .byte 11 // DW_AT_byte_size
+ .byte 11 // DW_FORM_data1
+ .byte 0 // EOM(1)
+ .byte 0 // EOM(2)
+ .byte 0 // EOM(3)
+ .section .debug_addr,"",@progbits
+.Laddr_table_base0:
+ .xword .Lfunc_begin0
+ .section .debug_gnu_pubnames,"",@progbits
+ .word .LpubNames_end0-.LpubNames_start0 // Length of Public Names Info
+.LpubNames_start0:
+ .hword 2 // DWARF Version
+ .word .Lcu_begin0 // Offset of Compilation Unit Info
+ .word 48 // Compilation Unit Length
+ .word 25 // DIE offset
+ .byte 48 // Attributes: FUNCTION, EXTERNAL
+ .asciz "callee" // External Name
+ .word 0 // End Mark
+.LpubNames_end0:
+ .section .debug_gnu_pubtypes,"",@progbits
+ .word .LpubTypes_end0-.LpubTypes_start0 // Length of Public Types Info
+.LpubTypes_start0:
+ .hword 2 // DWARF Version
+ .word .Lcu_begin0 // Offset of Compilation Unit Info
+ .word 48 // Compilation Unit Length
+ .word 53 // DIE offset
+ .byte 144 // Attributes: TYPE, STATIC
+ .asciz "int" // External Name
+ .word 0 // End Mark
+.LpubTypes_end0:
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf4-dwp-x86.s b/bolt/test/X86/dwarf4-dwp-x86.s
new file mode 100755
index 0000000..6dde167
--- /dev/null
+++ b/bolt/test/X86/dwarf4-dwp-x86.s
@@ -0,0 +1,405 @@
+## This test checks updating debug info via a DWARF4 DWP file
+# RUN: rm -rf %t && mkdir -p %t && cd %t
+# RUN: split-file %s %t
+# RUN: %clangxx %cxxflags -g -gdwarf-4 -gsplit-dwarf %t/main.s %t/callee.s -o main.exe
+# RUN: llvm-dwp -e %t/main.exe -o %t/main.exe.dwp
+# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections 2>&1 | FileCheck %s
+
+# CHECK-NOT: Assertion
+
+#--- main.s
+ .file "main.cpp"
+ .globl main # -- Begin function main
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .file 1 "." "main.cpp"
+ .loc 1 2 0 # main.cpp:2:0
+ .loc 1 2 21 prologue_end # main.cpp:2:21
+ .loc 1 2 14 epilogue_begin is_stmt 0 # main.cpp:2:14
+ retq
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 14 # DW_FORM_strp
+ .ascii "\264B" # DW_AT_GNU_pubnames
+ .byte 25 # DW_FORM_flag_present
+ .ascii "\260B" # DW_AT_GNU_dwo_name
+ .byte 14 # DW_FORM_strp
+ .ascii "\261B" # DW_AT_GNU_dwo_id
+ .byte 7 # DW_FORM_data8
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .ascii "\263B" # DW_AT_GNU_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 4 # DWARF version number
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lskel_string0 # DW_AT_comp_dir
+ # DW_AT_GNU_pubnames
+ .long .Lskel_string1 # DW_AT_GNU_dwo_name
+ .quad 1465063543908291764 # DW_AT_GNU_dwo_id
+ .quad .Lfunc_begin0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_GNU_addr_base
+.Ldebug_info_end0:
+ .section .debug_str,"MS",@progbits,1
+.Lskel_string0:
+ .asciz "." # string offset=0
+.Lskel_string1:
+ .asciz "main.exe-main.dwo" # string offset=2
+ .section .debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+ .asciz "main" # string offset=0
+.Linfo_string1:
+ .asciz "int" # string offset=5
+.Linfo_string2:
+ .byte 0 # string offset=9
+.Linfo_string3:
+ .asciz "main.cpp" # string offset=10
+.Linfo_string4:
+ .asciz "main.exe-main.dwo" # string offset=19
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 0
+ .long 5
+ .long 9
+ .long 10
+ .long 19
+ .section .debug_info.dwo,"e",@progbits
+ .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+ .short 4 # DWARF version number
+ .long 0 # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .byte 1 # Abbrev [1] 0xb:0x22 DW_TAG_compile_unit
+ .byte 2 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 3 # DW_AT_name
+ .byte 4 # DW_AT_GNU_dwo_name
+ .quad 1465063543908291764 # DW_AT_GNU_dwo_id
+ .byte 2 # Abbrev [2] 0x19:0xf DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 0 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 2 # DW_AT_decl_line
+ .long 40 # DW_AT_type
+ # DW_AT_external
+ .byte 3 # Abbrev [3] 0x28:0x4 DW_TAG_base_type
+ .byte 1 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .section .debug_abbrev.dwo,"e",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .ascii "\260B" # DW_AT_GNU_dwo_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .ascii "\261B" # DW_AT_GNU_dwo_id
+ .byte 7 # DW_FORM_data8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 0 # DW_CHILDREN_no
+ .byte 17 # DW_AT_low_pc
+ .ascii "\201>" # DW_FORM_GNU_addr_index
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_addr,"",@progbits
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+ .section .debug_gnu_pubnames,"",@progbits
+ .long .LpubNames_end0-.LpubNames_start0 # Length of Public Names Info
+.LpubNames_start0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 48 # Compilation Unit Length
+ .long 25 # DIE offset
+ .byte 48 # Attributes: FUNCTION, EXTERNAL
+ .asciz "main" # External Name
+ .long 0 # End Mark
+.LpubNames_end0:
+ .section .debug_gnu_pubtypes,"",@progbits
+ .long .LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info
+.LpubTypes_start0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 48 # Compilation Unit Length
+ .long 40 # DIE offset
+ .byte 144 # Attributes: TYPE, STATIC
+ .asciz "int" # External Name
+ .long 0 # End Mark
+.LpubTypes_end0:
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .addrsig_sym _Z6calleei
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
+#--- callee.s
+ .file "callee.cpp"
+ .globl _Z6calleei # -- Begin function _Z6calleei
+ .type _Z6calleei,@function
+_Z6calleei: # @_Z6calleei
+.Lfunc_begin0:
+ .file 1 "." "callee.cpp"
+ .loc 1 1 0 # callee.cpp:1:0
+ .loc 1 1 28 prologue_end # callee.cpp:1:28
+ .loc 1 1 21 epilogue_begin is_stmt 0 # callee.cpp:1:21
+ retq
+.Lfunc_end0:
+ .size _Z6calleei, .Lfunc_end0-_Z6calleei
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 0 # DW_CHILDREN_no
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 14 # DW_FORM_strp
+ .ascii "\264B" # DW_AT_GNU_pubnames
+ .byte 25 # DW_FORM_flag_present
+ .ascii "\260B" # DW_AT_GNU_dwo_name
+ .byte 14 # DW_FORM_strp
+ .ascii "\261B" # DW_AT_GNU_dwo_id
+ .byte 7 # DW_FORM_data8
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .ascii "\263B" # DW_AT_GNU_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 4 # DWARF version number
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lskel_string0 # DW_AT_comp_dir
+ # DW_AT_GNU_pubnames
+ .long .Lskel_string1 # DW_AT_GNU_dwo_name
+ .quad -8413212350243343807 # DW_AT_GNU_dwo_id
+ .quad .Lfunc_begin0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_GNU_addr_base
+.Ldebug_info_end0:
+ .section .debug_str,"MS",@progbits,1
+.Lskel_string0:
+ .asciz "." # string offset=0
+.Lskel_string1:
+ .asciz "main.exe-callee.dwo" # string offset=2
+ .section .debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+ .asciz "_Z6calleei" # string offset=0
+.Linfo_string1:
+ .asciz "callee" # string offset=11
+.Linfo_string2:
+ .asciz "int" # string offset=18
+.Linfo_string3:
+ .asciz "x" # string offset=22
+.Linfo_string4:
+ .byte 0 # string offset=24
+.Linfo_string5:
+ .asciz "callee.cpp" # string offset=25
+.Linfo_string6:
+ .asciz "main.exe-callee.dwo" # string offset=36
+ .section .debug_str_offsets.dwo,"e",@progbits
+ .long 0
+ .long 11
+ .long 18
+ .long 22
+ .long 24
+ .long 25
+ .long 36
+ .section .debug_info.dwo,"e",@progbits
+ .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+ .short 4 # DWARF version number
+ .long 0 # Offset Into Abbrev. Section
+ .byte 8 # Address Size (in bytes)
+ .byte 1 # Abbrev [1] 0xb:0x2f DW_TAG_compile_unit
+ .byte 4 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 5 # DW_AT_name
+ .byte 6 # DW_AT_GNU_dwo_name
+ .quad -8413212350243343807 # DW_AT_GNU_dwo_id
+ .byte 2 # Abbrev [2] 0x19:0x1c DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 0 # DW_AT_linkage_name
+ .byte 1 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 53 # DW_AT_type
+ # DW_AT_external
+ .byte 3 # Abbrev [3] 0x29:0xb DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 124
+ .byte 3 # DW_AT_name
+ .byte 1 # DW_AT_decl_file
+ .byte 1 # DW_AT_decl_line
+ .long 53 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x35:0x4 DW_TAG_base_type
+ .byte 2 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_dwo_end0:
+ .section .debug_abbrev.dwo,"e",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .ascii "\260B" # DW_AT_GNU_dwo_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .ascii "\261B" # DW_AT_GNU_dwo_id
+ .byte 7 # DW_FORM_data8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .ascii "\201>" # DW_FORM_GNU_addr_index
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 110 # DW_AT_linkage_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .byte 3 # DW_AT_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .ascii "\202>" # DW_FORM_GNU_str_index
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_addr,"",@progbits
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+ .section .debug_gnu_pubnames,"",@progbits
+ .long .LpubNames_end0-.LpubNames_start0 # Length of Public Names Info
+.LpubNames_start0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 48 # Compilation Unit Length
+ .long 25 # DIE offset
+ .byte 48 # Attributes: FUNCTION, EXTERNAL
+ .asciz "callee" # External Name
+ .long 0 # End Mark
+.LpubNames_end0:
+ .section .debug_gnu_pubtypes,"",@progbits
+ .long .LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info
+.LpubTypes_start0:
+ .short 2 # DWARF Version
+ .long .Lcu_begin0 # Offset of Compilation Unit Info
+ .long 48 # Compilation Unit Length
+ .long 53 # DIE offset
+ .byte 144 # Attributes: TYPE, STATIC
+ .asciz "int" # External Name
+ .long 0 # End Mark
+.LpubTypes_end0:
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
index affa276..5770bf7 100644
--- a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
+++ b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
@@ -164,6 +164,22 @@ getNewFieldsOrder(const RecordDecl *Definition,
return NewFieldsOrder;
}
+static bool isOrderValid(const RecordDecl *RD, ArrayRef<unsigned> FieldOrder) {
+ if (FieldOrder.empty())
+ return false;
+
+ // If there is a flexible array member in the struct, it must remain the last
+ // field.
+ if (RD->hasFlexibleArrayMember() &&
+ FieldOrder.back() != FieldOrder.size() - 1) {
+ llvm::errs()
+ << "Flexible array member must remain the last field in the struct\n";
+ return false;
+ }
+
+ return true;
+}
+
struct ReorderedStruct {
public:
ReorderedStruct(const RecordDecl *Decl, ArrayRef<unsigned> NewFieldsOrder)
@@ -662,7 +678,7 @@ public:
return;
SmallVector<unsigned, 4> NewFieldsOrder =
getNewFieldsOrder(RD, DesiredFieldsOrder);
- if (NewFieldsOrder.empty())
+ if (!isOrderValid(RD, NewFieldsOrder))
return;
ReorderedStruct RS{RD, NewFieldsOrder};
@@ -699,7 +715,7 @@ public:
std::unique_ptr<ASTConsumer> ReorderFieldsAction::newASTConsumer() {
return std::make_unique<ReorderingConsumer>(RecordName, DesiredFieldsOrder,
- Replacements);
+ Replacements);
}
} // namespace reorder_fields
diff --git a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp
index 78e58bc..36ac9a4 100644
--- a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp
+++ b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp
@@ -19,7 +19,7 @@ ComparisonInTempFailureRetryCheck::ComparisonInTempFailureRetryCheck(
StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),
RawRetryList(Options.get("RetryMacros", "TEMP_FAILURE_RETRY")) {
- StringRef(RawRetryList).split(RetryMacros, ",", -1, false);
+ RawRetryList.split(RetryMacros, ",", -1, false);
}
void ComparisonInTempFailureRetryCheck::storeOptions(
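
The same mechanical change repeats in the clang-tidy files below: each raw option string becomes a StringRef member and is split directly, instead of going through a temporary StringRef(...) wrap. As a reminder of the call shape, StringRef::split with KeepEmpty=false drops empty fields (standalone sketch; the helper name is invented):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"

    // StringRef::split(Out, Separator, MaxSplit, KeepEmpty): with
    // KeepEmpty = false, "a,,b" yields {"a", "b"}, not {"a", "", "b"}.
    void splitExample() {
      llvm::SmallVector<llvm::StringRef, 4> Parts;
      llvm::StringRef("assert,NSAssert,NSCAssert")
          .split(Parts, ",", -1, /*KeepEmpty=*/false);
      // Parts == {"assert", "NSAssert", "NSCAssert"}
    }
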
diff --git a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp
index 227641d..1700502 100644
--- a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp
@@ -92,7 +92,7 @@ AssertSideEffectCheck::AssertSideEffectCheck(StringRef Name,
RawAssertList(Options.get("AssertMacros", "assert,NSAssert,NSCAssert")),
IgnoredFunctions(utils::options::parseListPair(
"__builtin_expect;", Options.get("IgnoredFunctions", ""))) {
- StringRef(RawAssertList).split(AssertMacros, ",", -1, false);
+ RawAssertList.split(AssertMacros, ",", -1, false);
}
// The options are explained in AssertSideEffectCheck.h.
diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
index 3d839b5..837a86f 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
@@ -39,12 +39,12 @@ ExceptionEscapeCheck::ExceptionEscapeCheck(StringRef Name,
RawIgnoredExceptions(Options.get("IgnoredExceptions", "")) {
llvm::SmallVector<StringRef, 8> FunctionsThatShouldNotThrowVec,
IgnoredExceptionsVec;
- StringRef(RawFunctionsThatShouldNotThrow)
- .split(FunctionsThatShouldNotThrowVec, ",", -1, false);
+ RawFunctionsThatShouldNotThrow.split(FunctionsThatShouldNotThrowVec, ",", -1,
+ false);
FunctionsThatShouldNotThrow.insert_range(FunctionsThatShouldNotThrowVec);
llvm::StringSet<> IgnoredExceptions;
- StringRef(RawIgnoredExceptions).split(IgnoredExceptionsVec, ",", -1, false);
+ RawIgnoredExceptions.split(IgnoredExceptionsVec, ",", -1, false);
IgnoredExceptions.insert_range(IgnoredExceptionsVec);
Tracer.ignoreExceptions(std::move(IgnoredExceptions));
Tracer.ignoreBadAlloc(true);
diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h
index ae6e202..bd1e7ba 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h
@@ -33,8 +33,8 @@ public:
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
private:
- std::string RawFunctionsThatShouldNotThrow;
- std::string RawIgnoredExceptions;
+ StringRef RawFunctionsThatShouldNotThrow;
+ StringRef RawIgnoredExceptions;
llvm::StringSet<> FunctionsThatShouldNotThrow;
utils::ExceptionAnalyzer Tracer;
diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp
index aa95fad..b8bca72 100644
--- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp
@@ -20,7 +20,7 @@ ProperlySeededRandomGeneratorCheck::ProperlySeededRandomGeneratorCheck(
: ClangTidyCheck(Name, Context),
RawDisallowedSeedTypes(
Options.get("DisallowedSeedTypes", "time_t,std::time_t")) {
- StringRef(RawDisallowedSeedTypes).split(DisallowedSeedTypes, ',');
+ RawDisallowedSeedTypes.split(DisallowedSeedTypes, ',');
}
void ProperlySeededRandomGeneratorCheck::storeOptions(
diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h
index ea30127..7da01cc 100644
--- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h
+++ b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h
@@ -33,7 +33,7 @@ private:
void checkSeed(const ast_matchers::MatchFinder::MatchResult &Result,
const T *Func);
- std::string RawDisallowedSeedTypes;
+ StringRef RawDisallowedSeedTypes;
SmallVector<StringRef, 5> DisallowedSeedTypes;
};
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
index 4dc4bae..b921819 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
@@ -53,7 +53,7 @@ StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) {
unless(hasImplicitDestinationType(
qualType(matchers::matchesAnyListedTypeName(NameList)))));
- auto IsOrHasDescendant = [](auto InnerMatcher) {
+ auto IsOrHasDescendant = [](const auto &InnerMatcher) {
return anyOf(InnerMatcher, hasDescendant(InnerMatcher));
};
@@ -494,7 +494,7 @@ UseNullptrCheck::UseNullptrCheck(StringRef Name, ClangTidyContext *Context)
NullMacrosStr(Options.get("NullMacros", "NULL")),
IgnoredTypes(utils::options::parseStringList(Options.get(
"IgnoredTypes", "_CmpUnspecifiedParam;^std::__cmp_cat::__unspec"))) {
- StringRef(NullMacrosStr).split(NullMacros, ",");
+ NullMacrosStr.split(NullMacros, ",");
}
void UseNullptrCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
diff --git a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp
index f9becee..3801fc0 100644
--- a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp
@@ -23,7 +23,7 @@ ExceptionEscapeCheck::ExceptionEscapeCheck(StringRef Name,
llvm::SmallVector<StringRef, 8> IgnoredExceptionsVec;
llvm::StringSet<> IgnoredExceptions;
- StringRef(RawIgnoredExceptions).split(IgnoredExceptionsVec, ",", -1, false);
+ RawIgnoredExceptions.split(IgnoredExceptionsVec, ",", -1, false);
llvm::transform(IgnoredExceptionsVec, IgnoredExceptionsVec.begin(),
[](StringRef S) { return S.trim(); });
IgnoredExceptions.insert_range(IgnoredExceptionsVec);
diff --git a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h
index 39da124..757a933 100644
--- a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h
+++ b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h
@@ -30,7 +30,7 @@ public:
void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
private:
- std::string RawIgnoredExceptions;
+ StringRef RawIgnoredExceptions;
utils::ExceptionAnalyzer Tracer;
};
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 9257dc6..c3a6d2f 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -329,6 +329,11 @@ Changes in existing checks
<clang-tidy/checks/modernize/use-designated-initializers>` check to
suggest using designated initializers for aliased aggregate types.
+- Improved :doc:`modernize-use-nullptr
+ <clang-tidy/checks/modernize/use-nullptr>` check by fixing a crash
+ on Windows when the check was enabled with a 32-bit :program:`clang-tidy`
+ binary.
+
- Improved :doc:`modernize-use-std-format
<clang-tidy/checks/modernize/use-std-format>` check to correctly match
when the format string is converted to a different type by an implicit
diff --git a/clang-tools-extra/test/clang-reorder-fields/FlexibleArrayMember.c b/clang-tools-extra/test/clang-reorder-fields/FlexibleArrayMember.c
new file mode 100644
index 0000000..ef64350
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/FlexibleArrayMember.c
@@ -0,0 +1,10 @@
+// RUN: clang-reorder-fields -record-name Foo -fields-order z,y,x %s -- 2>&1 | FileCheck --check-prefix=CHECK-BAD %s
+// RUN: clang-reorder-fields -record-name Foo -fields-order y,x,z %s -- | FileCheck --check-prefix=CHECK-GOOD %s
+
+// CHECK-BAD: {{^Flexible array member must remain the last field in the struct}}
+
+struct Foo {
+ int x; // CHECK-GOOD: {{^ int y;}}
+ int y; // CHECK-GOOD-NEXT: {{^ int x;}}
+ int z[]; // CHECK-GOOD-NEXT: {{^ int z\[\];}}
+};
diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index c44e646..80140d2 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -1446,6 +1446,9 @@ class CursorKind(BaseEnumeration):
# OpenMP stripe directive.
OMP_STRIPE_DIRECTIVE = 310
+ # OpenMP fuse directive.
+ OMP_FUSE_DIRECTIVE = 311
+
# OpenACC Compute Construct.
OPEN_ACC_COMPUTE_DIRECTIVE = 320
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index 6108e54..68ca7be 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -482,6 +482,8 @@ implementation.
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| loop transformation apply clause | :none:`unclaimed` | :none:`unclaimed` | |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
+| loop fuse transformation | :good:`done` | :none:`unclaimed` | |
++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| workdistribute construct | | :none:`in progress` | @skc7, @mjklemm |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| task_iteration | :none:`unclaimed` | :none:`unclaimed` | |
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 98c889c..79dc0b2 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -432,6 +432,9 @@ Bug Fixes to C++ Support
- Fix an assertion failure when taking the address on a non-type template parameter argument of
object type. (#GH151531)
- Suppress ``-Wdouble-promotion`` when explicitly asked for with C++ list initialization (#GH33409).
+- Fix the result of ``__builtin_is_implicit_lifetime`` for types with a user-provided constructor. (#GH160610)
+- Correctly deduce return types in ``decltype`` expressions. (#GH160497) (#GH56652) (#GH116319) (#GH161196)
+- Fixed a crash in the pre-C++23 warning for attributes before a lambda declarator (#GH161070).
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -599,6 +602,7 @@ OpenMP Support
- Added support for ``defaultmap`` directive implicit-behavior ``storage``.
- Added support for ``defaultmap`` directive implicit-behavior ``private``.
- Added parsing and semantic analysis support for ``groupprivate`` directive.
+- Added support for the ``omp fuse`` directive.
Improvements
^^^^^^^^^^^^
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index be038d9..f13d9c9 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2162,6 +2162,10 @@ enum CXCursorKind {
*/
CXCursor_OMPStripeDirective = 310,
+ /** OpenMP fuse directive.
+ */
+ CXCursor_OMPFuseDirective = 311,
+
/** OpenACC Compute Construct.
*/
CXCursor_OpenACCComputeConstruct = 320,
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 42b4268..68d220a 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -1149,6 +1149,80 @@ public:
static OMPFullClause *CreateEmpty(const ASTContext &C);
};
+/// This class represents the 'looprange' clause in the
+/// '#pragma omp fuse' directive.
+///
+/// \code{.c}
+/// #pragma omp fuse looprange(1,2)
+/// {
+///   for(int i = 0; i < 64; ++i) {...}
+///   for(int j = 0; j < 256; j+=2) {...}
+///   for(int k = 127; k >= 0; --k) {...}
+/// }
+/// \endcode
+class OMPLoopRangeClause final : public OMPClause {
+ friend class OMPClauseReader;
+ /// Location of '('
+ SourceLocation LParenLoc;
+
+ /// Location of first and count expressions
+ SourceLocation FirstLoc, CountLoc;
+
+ /// Number of looprange arguments (always 2: first, count)
+ enum { FirstExpr, CountExpr, NumArgs };
+ Stmt *Args[NumArgs] = {nullptr, nullptr};
+
+ /// Set looprange 'first' expression
+ void setFirst(Expr *E) { Args[FirstExpr] = E; }
+
+ /// Set looprange 'count' expression
+ void setCount(Expr *E) { Args[CountExpr] = E; }
+
+ /// Build an empty clause for deserialization.
+ explicit OMPLoopRangeClause()
+ : OMPClause(llvm::omp::OMPC_looprange, {}, {}) {}
+
+public:
+ /// Build a 'looprange' clause AST node.
+ static OMPLoopRangeClause *
+ Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+ SourceLocation FirstLoc, SourceLocation CountLoc,
+ SourceLocation EndLoc, Expr *First, Expr *Count);
+
+ /// Build an empty 'looprange' clause node.
+ static OMPLoopRangeClause *CreateEmpty(const ASTContext &C);
+
+ // Location getters/setters
+ SourceLocation getLParenLoc() const { return LParenLoc; }
+ SourceLocation getFirstLoc() const { return FirstLoc; }
+ SourceLocation getCountLoc() const { return CountLoc; }
+
+ void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
+ void setFirstLoc(SourceLocation Loc) { FirstLoc = Loc; }
+ void setCountLoc(SourceLocation Loc) { CountLoc = Loc; }
+
+ /// Get looprange 'first' expression
+ Expr *getFirst() const { return cast_or_null<Expr>(Args[FirstExpr]); }
+
+ /// Get looprange 'count' expression
+ Expr *getCount() const { return cast_or_null<Expr>(Args[CountExpr]); }
+
+ child_range children() { return child_range(Args, Args + NumArgs); }
+ const_child_range children() const {
+ return const_child_range(Args, Args + NumArgs);
+ }
+
+ child_range used_children() {
+ return child_range(child_iterator(), child_iterator());
+ }
+ const_child_range used_children() const {
+ return const_child_range(const_child_iterator(), const_child_iterator());
+ }
+
+ static bool classof(const OMPClause *T) {
+ return T->getClauseKind() == llvm::omp::OMPC_looprange;
+ }
+};
+
/// Representation of the 'partial' clause of the '#pragma omp unroll'
/// directive.
///
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index af1a073..7a2881f 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3177,6 +3177,9 @@ DEF_TRAVERSE_STMT(OMPUnrollDirective,
DEF_TRAVERSE_STMT(OMPReverseDirective,
{ TRY_TO(TraverseOMPExecutableDirective(S)); })
+DEF_TRAVERSE_STMT(OMPFuseDirective,
+ { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
DEF_TRAVERSE_STMT(OMPInterchangeDirective,
{ TRY_TO(TraverseOMPExecutableDirective(S)); })
@@ -3495,6 +3498,14 @@ bool RecursiveASTVisitor<Derived>::VisitOMPFullClause(OMPFullClause *C) {
}
template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPLoopRangeClause(
+ OMPLoopRangeClause *C) {
+ TRY_TO(TraverseStmt(C->getFirst()));
+ TRY_TO(TraverseStmt(C->getCount()));
+ return true;
+}
+
+template <typename Derived>
bool RecursiveASTVisitor<Derived>::VisitOMPPartialClause(OMPPartialClause *C) {
TRY_TO(TraverseStmt(C->getFactor()));
return true;
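
The new VisitOMPLoopRangeClause hook participates in the usual CRTP traversal, visiting the clause's 'first' and 'count' expressions. A minimal client sketch (the visitor name is invented):

    #include "clang/AST/RecursiveASTVisitor.h"

    // Counts 'looprange' clauses; the new hook above also traverses the
    // expressions exposed by getFirst()/getCount().
    class LoopRangeCounter
        : public clang::RecursiveASTVisitor<LoopRangeCounter> {
    public:
      unsigned Count = 0;
      bool VisitOMPLoopRangeClause(clang::OMPLoopRangeClause *C) {
        ++Count;
        return true; // keep traversing
      }
    };
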
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
index d9f87f1..bc6aeaa8 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -21,6 +21,7 @@
#include "clang/AST/StmtCXX.h"
#include "clang/Basic/OpenMPKinds.h"
#include "clang/Basic/SourceLocation.h"
+#include "llvm/Support/Casting.h"
namespace clang {
@@ -677,6 +678,10 @@ public:
}
};
+// Forward declaration of a generic loop transformation. Used in the declaration
+// of OMPLoopBasedDirective.
+class OMPLoopTransformationDirective;
+
/// The base class for all loop-based directives, including loop transformation
/// directives.
class OMPLoopBasedDirective : public OMPExecutableDirective {
@@ -889,24 +894,23 @@ public:
/// Calls the specified callback function for all the loops in \p CurStmt,
/// from the outermost to the innermost.
- static bool doForAllLoops(
- Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops,
- llvm::function_ref<bool(unsigned, Stmt *)> Callback,
- llvm::function_ref<void(OMPCanonicalLoopNestTransformationDirective *)>
- OnTransformationCallback);
+ static bool
+ doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops,
+ unsigned NumLoops,
+ llvm::function_ref<bool(unsigned, Stmt *)> Callback,
+ llvm::function_ref<void(OMPLoopTransformationDirective *)>
+ OnTransformationCallback);
static bool
doForAllLoops(const Stmt *CurStmt, bool TryImperfectlyNestedLoops,
unsigned NumLoops,
llvm::function_ref<bool(unsigned, const Stmt *)> Callback,
- llvm::function_ref<
- void(const OMPCanonicalLoopNestTransformationDirective *)>
+ llvm::function_ref<void(const OMPLoopTransformationDirective *)>
OnTransformationCallback) {
auto &&NewCallback = [Callback](unsigned Cnt, Stmt *CurStmt) {
return Callback(Cnt, CurStmt);
};
auto &&NewTransformCb =
- [OnTransformationCallback](
- OMPCanonicalLoopNestTransformationDirective *A) {
+ [OnTransformationCallback](OMPLoopTransformationDirective *A) {
OnTransformationCallback(A);
};
return doForAllLoops(const_cast<Stmt *>(CurStmt), TryImperfectlyNestedLoops,
@@ -919,7 +923,7 @@ public:
doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops,
unsigned NumLoops,
llvm::function_ref<bool(unsigned, Stmt *)> Callback) {
- auto &&TransformCb = [](OMPCanonicalLoopNestTransformationDirective *) {};
+ auto &&TransformCb = [](OMPLoopTransformationDirective *) {};
return doForAllLoops(CurStmt, TryImperfectlyNestedLoops, NumLoops, Callback,
TransformCb);
}
@@ -957,9 +961,11 @@ public:
};
/// Common class of data shared between
-/// OMPCanonicalLoopNestTransformationDirective and transformations over
-/// canonical loop sequences.
+/// OMPCanonicalLoopNestTransformationDirective and
+/// OMPCanonicalLoopSequenceTransformationDirective.
class OMPLoopTransformationDirective {
+ friend class ASTStmtReader;
+
/// Number of (top-level) generated loops.
/// This value is 1 for most transformations as they only map one loop nest
/// into another.
@@ -969,15 +975,39 @@ class OMPLoopTransformationDirective {
/// generate more than one loop nest, so the value would be >= 1.
unsigned NumGeneratedTopLevelLoops = 1;
+ /// We need this because we cannot easily make OMPLoopTransformationDirective
+ /// a proper Stmt.
+ Stmt *S = nullptr;
+
protected:
void setNumGeneratedTopLevelLoops(unsigned N) {
NumGeneratedTopLevelLoops = N;
}
+ explicit OMPLoopTransformationDirective(Stmt *S) : S(S) {}
+
public:
unsigned getNumGeneratedTopLevelLoops() const {
return NumGeneratedTopLevelLoops;
}
+
+ /// Returns the specific directive related to this loop transformation.
+ Stmt *getDirective() const { return S; }
+
+ /// Get the de-sugared statements after the loop transformation.
+ ///
+ /// Might be nullptr if the directive generates no loops and is handled
+ /// directly in CodeGen, or if the context is still template-dependent and
+ /// has not been resolved yet.
+ Stmt *getTransformedStmt() const;
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const;
+
+ static bool classof(const Stmt *T) {
+ return isa<OMPCanonicalLoopNestTransformationDirective,
+ OMPCanonicalLoopSequenceTransformationDirective>(T);
+ }
};
/// The base class for all transformation directives of canonical loop nests.
@@ -990,7 +1020,8 @@ protected:
explicit OMPCanonicalLoopNestTransformationDirective(
StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc,
SourceLocation EndLoc, unsigned NumAssociatedLoops)
- : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops) {}
+ : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops),
+ OMPLoopTransformationDirective(this) {}
public:
/// Return the number of associated (consumed) loops.
@@ -5928,6 +5959,112 @@ public:
}
};
+/// The base class for all transformation directives of canonical loop
+/// sequences (currently only 'fuse').
+class OMPCanonicalLoopSequenceTransformationDirective
+ : public OMPExecutableDirective,
+ public OMPLoopTransformationDirective {
+ friend class ASTStmtReader;
+
+protected:
+ explicit OMPCanonicalLoopSequenceTransformationDirective(
+ StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc,
+ SourceLocation EndLoc)
+ : OMPExecutableDirective(SC, Kind, StartLoc, EndLoc),
+ OMPLoopTransformationDirective(this) {}
+
+public:
+ /// Get the de-sugared statements after the loop transformation.
+ ///
+ /// Might be nullptr if the directive generates no loops and is handled
+ /// directly in CodeGen, or if the context is still template-dependent and
+ /// has not been resolved yet.
+ Stmt *getTransformedStmt() const;
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const;
+
+ static bool classof(const Stmt *T) {
+ Stmt::StmtClass C = T->getStmtClass();
+ return C == OMPFuseDirectiveClass;
+ }
+};
+
+/// Represents the '#pragma omp fuse' loop transformation directive.
+///
+/// \code{.c}
+/// #pragma omp fuse
+/// {
+/// for(int i = 0; i < m1; ++i) {...}
+/// for(int j = 0; j < m2; ++j) {...}
+/// ...
+/// }
+/// \endcode
+class OMPFuseDirective final
+ : public OMPCanonicalLoopSequenceTransformationDirective {
+ friend class ASTStmtReader;
+ friend class OMPExecutableDirective;
+
+ // Offsets of child members.
+ enum {
+ PreInitsOffset = 0,
+ TransformedStmtOffset,
+ };
+
+ explicit OMPFuseDirective(SourceLocation StartLoc, SourceLocation EndLoc)
+ : OMPCanonicalLoopSequenceTransformationDirective(
+ OMPFuseDirectiveClass, llvm::omp::OMPD_fuse, StartLoc, EndLoc) {}
+
+ void setPreInits(Stmt *PreInits) {
+ Data->getChildren()[PreInitsOffset] = PreInits;
+ }
+
+ void setTransformedStmt(Stmt *S) {
+ Data->getChildren()[TransformedStmtOffset] = S;
+ }
+
+public:
+ /// Create a new AST node representation for '#pragma omp fuse'.
+ ///
+ /// \param C Context of the AST
+ /// \param StartLoc Location of the introducer (e.g. the 'omp' token)
+ /// \param EndLoc Location of the directive's end (e.g. the tok::eod)
+ /// \param Clauses The directive's clauses
+ /// \param NumGeneratedTopLevelLoops Number of top-level generated loops
+ ///        (typically 1, but a looprange clause can change this)
+ /// \param AssociatedStmt The outermost associated loop
+ /// \param TransformedStmt The loop nest after fusion, or nullptr in
+ ///        dependent contexts
+ /// \param PreInits Helper preinits statements for the loop nest
+ static OMPFuseDirective *
+ Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+ ArrayRef<OMPClause *> Clauses, unsigned NumGeneratedTopLevelLoops,
+ Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits);
+
+ /// Build an empty '#pragma omp fuse' AST node for deserialization.
+ ///
+ /// \param C Context of the AST
+ /// \param NumClauses Number of clauses to allocate
+ static OMPFuseDirective *CreateEmpty(const ASTContext &C,
+ unsigned NumClauses);
+
+ /// Gets the associated loops after the transformation. This is the de-sugared
+ /// replacement or nullptr in dependent contexts.
+ Stmt *getTransformedStmt() const {
+ return Data->getChildren()[TransformedStmtOffset];
+ }
+
+ /// Return preinits statement.
+ Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; }
+
+ static bool classof(const Stmt *T) {
+ return T->getStmtClass() == OMPFuseDirectiveClass;
+ }
+};
+
/// This represents '#pragma omp scan' directive.
///
/// \code
@@ -6596,4 +6733,37 @@ public:
} // end namespace clang
+namespace llvm {
+// Allow a Stmt* to be cast correctly to an OMPLoopTransformationDirective*.
+// The default routines would just use a C-style cast which won't work well
+// for the multiple inheritance here. We have to use a static cast from the
+// corresponding subclass.
+template <>
+struct CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>
+ : public NullableValueCastFailed<clang::OMPLoopTransformationDirective *>,
+ public DefaultDoCastIfPossible<
+ clang::OMPLoopTransformationDirective *, clang::Stmt *,
+ CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>> {
+ static bool isPossible(const clang::Stmt *T) {
+ return clang::OMPLoopTransformationDirective::classof(T);
+ }
+
+ static clang::OMPLoopTransformationDirective *doCast(clang::Stmt *T) {
+ if (auto *D =
+ dyn_cast<clang::OMPCanonicalLoopNestTransformationDirective>(T))
+ return static_cast<clang::OMPLoopTransformationDirective *>(D);
+ if (auto *D =
+ dyn_cast<clang::OMPCanonicalLoopSequenceTransformationDirective>(T))
+ return static_cast<clang::OMPLoopTransformationDirective *>(D);
+ llvm_unreachable("unexpected type");
+ }
+};
+template <>
+struct CastInfo<clang::OMPLoopTransformationDirective, const clang::Stmt *>
+ : public ConstStrippingForwardingCast<
+ clang::OMPLoopTransformationDirective, const clang::Stmt *,
+ CastInfo<clang::OMPLoopTransformationDirective, clang::Stmt *>> {};
+
+} // namespace llvm
+
#endif
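
These CastInfo specializations let the standard casting helpers operate on a plain Stmt* even though OMPLoopTransformationDirective is not itself a Stmt. A usage sketch (the function is invented for illustration):

    // With the specializations above, a Stmt* can be queried for the shared
    // transformation interface across the two directive hierarchies.
    static void inspectTransformation(clang::Stmt *S) {
      if (auto *TD =
              llvm::dyn_cast<clang::OMPLoopTransformationDirective>(S)) {
        unsigned N = TD->getNumGeneratedTopLevelLoops();
        clang::Stmt *Xformed = TD->getTransformedStmt(); // may be nullptr
        (void)N;
        (void)Xformed;
      }
    }
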
diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def
index d3dff446..089a72b 100644
--- a/clang/include/clang/Basic/AMDGPUTypes.def
+++ b/clang/include/clang/Basic/AMDGPUTypes.def
@@ -21,6 +21,7 @@
#endif
AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8)
+AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_texture_t", AMDGPUTexture, AMDGPUTextureTy, 256, 256, 0)
AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0)
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
index 9aad00b..b856ad1 100644
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -34,6 +34,7 @@
// Q -> target builtin type, followed by a character to distinguish the builtin type
// Qa -> AArch64 svcount_t builtin type.
// Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type.
+// Qt -> AMDGPU __amdgpu_texture_t builtin type.
// E -> ext_vector, followed by the number of elements and the base type.
// X -> _Complex, followed by the base type.
// Y -> ptrdiff_t
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 4d9e123..c724136 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1141,7 +1141,7 @@ def warn_cxx23_compat_binding_pack : Warning<
def err_capture_default_first : Error<
"capture default must be first">;
def ext_decl_attrs_on_lambda : ExtWarn<
- "%select{an attribute specifier sequence|%0}1 in this position "
+ "%select{an attribute specifier sequence|%1}0 in this position "
"is a C++23 extension">, InGroup<CXX23AttrsOnLambda>;
def ext_lambda_missing_parens : ExtWarn<
"lambda without a parameter clause is a C++23 extension">,
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index dc4c6d3..b157cbb 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -5770,8 +5770,10 @@ def err_template_recursion_depth_exceeded : Error<
def err_constraint_depends_on_self
: Error<"satisfaction of constraint %0 depends on itself">,
NoSFINAE;
-def note_template_recursion_depth : Note<
- "use -ftemplate-depth=N to increase recursive template instantiation depth">;
+def note_template_recursion_depth
+ : Note<"use -ftemplate-depth=N to increase recursive template "
+ "instantiation depth">,
+ NoSFINAE;
def err_template_instantiate_within_definition : Error<
"%select{implicit|explicit}0 instantiation of template %1 within its"
@@ -11761,6 +11763,18 @@ def note_omp_implicit_dsa : Note<
"implicitly determined as %0">;
def err_omp_loop_var_dsa : Error<
"loop iteration variable in the associated loop of 'omp %1' directive may not be %0, predetermined as %2">;
+def err_omp_not_a_loop_sequence
+ : Error<"statement after '#pragma omp %0' must be a loop sequence "
+ "containing canonical loops or loop-generating constructs">;
+def err_omp_empty_loop_sequence
+ : Error<"loop sequence after '#pragma omp %0' must contain at least 1 "
+ "canonical loop or loop-generating construct">;
+def err_omp_invalid_looprange
+ : Error<"looprange clause selects loops from %1 to %2 but this exceeds the "
+ "number of loops (%3) in the loop sequence">;
+def warn_omp_redundant_fusion : Warning<"looprange clause selects a single "
+ "loop, resulting in redundant fusion">,
+ InGroup<OpenMPClauses>;
def err_omp_not_for : Error<
"%select{statement after '#pragma omp %1' must be a for loop|"
"expected %2 for loops after '#pragma omp %1'%select{|, but found only %4}3}0">;
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index 4c988e0..ed89a31 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -391,6 +391,13 @@ bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind);
bool isOpenMPCanonicalLoopNestTransformationDirective(
OpenMPDirectiveKind DKind);
+/// Checks if the specified directive is a loop transformation directive that
+/// applies to a canonical loop sequence.
+/// \param DKind Specified directive.
+/// \return True iff the directive is a canonical loop sequence
+/// transformation.
+bool isOpenMPCanonicalLoopSequenceTransformationDirective(
+ OpenMPDirectiveKind DKind);
+
/// Checks if the specified directive is a loop transformation directive.
/// \param DKind Specified directive.
/// \return True iff the directive is a loop transformation.
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index dd1a244..bf3686b 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -238,6 +238,10 @@ def OMPUnrollDirective : StmtNode<OMPCanonicalLoopNestTransformationDirective>;
def OMPReverseDirective : StmtNode<OMPCanonicalLoopNestTransformationDirective>;
def OMPInterchangeDirective
: StmtNode<OMPCanonicalLoopNestTransformationDirective>;
+def OMPCanonicalLoopSequenceTransformationDirective
+ : StmtNode<OMPExecutableDirective, 1>;
+def OMPFuseDirective
+ : StmtNode<OMPCanonicalLoopSequenceTransformationDirective>;
def OMPForDirective : StmtNode<OMPLoopDirective>;
def OMPForSimdDirective : StmtNode<OMPLoopDirective>;
def OMPSectionsDirective : StmtNode<OMPExecutableDirective>;
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index e5c5ada..ceb16174 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1259,6 +1259,10 @@ public:
ArrayRef<ConstraintInfo> OutputConstraints,
unsigned &Index) const;
+ std::string
+ simplifyConstraint(StringRef Constraint,
+ SmallVectorImpl<ConstraintInfo> *OutCons = nullptr) const;
+
// Constraint parm will be left pointing at the last character of
// the constraint. In practice, it won't be changed unless the
// constraint is longer than one character.
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index bb39444..e1be08c 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -683,8 +683,8 @@ def CIR_ConditionOp : CIR_Op<"condition", [
//===----------------------------------------------------------------------===//
defvar CIR_YieldableScopes = [
- "ArrayCtor", "ArrayDtor", "CaseOp", "DoWhileOp", "ForOp", "IfOp", "ScopeOp",
- "SwitchOp", "TernaryOp", "WhileOp"
+ "ArrayCtor", "ArrayDtor", "CaseOp", "DoWhileOp", "ForOp", "GlobalOp", "IfOp",
+ "ScopeOp", "SwitchOp", "TernaryOp", "WhileOp"
];
def CIR_YieldOp : CIR_Op<"yield", [
@@ -1776,7 +1776,9 @@ def CIR_GlobalLinkageKind : CIR_I32EnumAttr<
// is upstreamed.
def CIR_GlobalOp : CIR_Op<"global", [
- DeclareOpInterfaceMethods<CIRGlobalValueInterface>
+ DeclareOpInterfaceMethods<RegionBranchOpInterface>,
+ DeclareOpInterfaceMethods<CIRGlobalValueInterface>,
+ NoRegionArguments
]> {
let summary = "Declare or define a global variable";
let description = [{
@@ -1807,6 +1809,9 @@ def CIR_GlobalOp : CIR_Op<"global", [
UnitAttr:$dso_local,
OptionalAttr<I64Attr>:$alignment);
+ let regions = (region MaxSizedRegion<1>:$ctorRegion,
+ MaxSizedRegion<1>:$dtorRegion);
+
let assemblyFormat = [{
($sym_visibility^)?
(`` $global_visibility^)?
@@ -1815,24 +1820,34 @@ def CIR_GlobalOp : CIR_Op<"global", [
(`comdat` $comdat^)?
(`dso_local` $dso_local^)?
$sym_name
- custom<GlobalOpTypeAndInitialValue>($sym_type, $initial_value)
+ custom<GlobalOpTypeAndInitialValue>($sym_type, $initial_value,
+ $ctorRegion, $dtorRegion)
attr-dict
}];
let extraClassDeclaration = [{
- bool isDeclaration() { return !getInitialValue(); }
+ bool isDeclaration() {
+ return !getInitialValue() && getCtorRegion().empty() && getDtorRegion().empty();
+ }
bool hasInitializer() { return !isDeclaration(); }
}];
let skipDefaultBuilders = 1;
- let builders = [OpBuilder<(ins
- "llvm::StringRef":$sym_name,
- "mlir::Type":$sym_type,
- CArg<"bool", "false">:$isConstant,
- // CIR defaults to external linkage.
- CArg<"cir::GlobalLinkageKind",
- "cir::GlobalLinkageKind::ExternalLinkage">:$linkage)>];
+ let builders = [
+ OpBuilder<(ins
+ "llvm::StringRef":$sym_name,
+ "mlir::Type":$sym_type,
+ CArg<"bool", "false">:$isConstant,
+ // CIR defaults to external linkage.
+ CArg<"cir::GlobalLinkageKind",
+ "cir::GlobalLinkageKind::ExternalLinkage">:$linkage,
+ CArg<"llvm::function_ref<void(mlir::OpBuilder &, mlir::Location)>",
+ "nullptr">:$ctorBuilder,
+ CArg<"llvm::function_ref<void(mlir::OpBuilder &, mlir::Location)>",
+ "nullptr">:$dtorBuilder)
+ >
+ ];
let hasVerifier = 1;
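With the two new regions and the extended builder, callers can attach dynamic initialization directly to the global. A rough sketch of the new overload in use (all names and the yield terminator are illustrative, not taken from the patch):

  auto g = builder.create<cir::GlobalOp>(
      loc, "g", someCirType, /*isConstant=*/false,
      cir::GlobalLinkageKind::ExternalLinkage,
      /*ctorBuilder=*/[&](mlir::OpBuilder &b, mlir::Location l) {
        // ... emit the dynamic-initialization body here ...
        b.create<cir::YieldOp>(l);
      },
      /*dtorBuilder=*/nullptr);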
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 0fac1b2..7e59989 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -208,6 +208,7 @@ struct MissingFeatures {
static bool dataLayoutTypeAllocSize() { return false; }
static bool dataLayoutTypeStoreSize() { return false; }
static bool deferredCXXGlobalInit() { return false; }
+ static bool deleteArray() { return false; }
static bool devirtualizeMemberFunction() { return false; }
static bool ehCleanupFlags() { return false; }
static bool ehCleanupScope() { return false; }
@@ -219,6 +220,7 @@ struct MissingFeatures {
static bool emitCondLikelihoodViaExpectIntrinsic() { return false; }
static bool emitLifetimeMarkers() { return false; }
static bool emitLValueAlignmentAssumption() { return false; }
+ static bool emitNullCheckForDeleteCalls() { return false; }
static bool emitNullabilityCheck() { return false; }
static bool emitTypeCheck() { return false; }
static bool emitTypeMetadataCodeForVCall() { return false; }
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 30edd30..e301cf1 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -6767,6 +6767,9 @@ private:
OpenMPClauseKind Kind,
bool ParseOnly);
+ /// Parses the 'looprange' clause of a '#pragma omp fuse' directive.
+ OMPClause *ParseOpenMPLoopRangeClause();
+
/// Parses the 'sizes' clause of a '#pragma omp tile' directive.
OMPClause *ParseOpenMPSizesClause();
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 2bd6be2..f53aafd 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -13335,8 +13335,6 @@ public:
Sema &SemaRef;
bool Invalid;
bool AlreadyInstantiating;
- bool CheckInstantiationDepth(SourceLocation PointOfInstantiation,
- SourceRange InstantiationRange);
InstantiatingTemplate(Sema &SemaRef,
CodeSynthesisContext::SynthesisKind Kind,
@@ -13529,7 +13527,7 @@ public:
~ArgPackSubstIndexRAII() { Self.ArgPackSubstIndex = OldSubstIndex; }
};
- void pushCodeSynthesisContext(CodeSynthesisContext Ctx);
+ bool pushCodeSynthesisContext(CodeSynthesisContext Ctx);
void popCodeSynthesisContext();
void PrintContextStack(InstantiationContextDiagFuncRef DiagFunc) {
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index c0fd7a6..daf58b1 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -463,6 +463,13 @@ public:
Stmt *AStmt,
SourceLocation StartLoc,
SourceLocation EndLoc);
+
+ /// Called on well-formed '#pragma omp fuse' after parsing of its
+ /// clauses and the associated statement.
+ StmtResult ActOnOpenMPFuseDirective(ArrayRef<OMPClause *> Clauses,
+ Stmt *AStmt, SourceLocation StartLoc,
+ SourceLocation EndLoc);
+
/// Called on well-formed '\#pragma omp for' after parsing
/// of the associated statement.
StmtResult
@@ -921,6 +928,12 @@ public:
SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc);
+
+ /// Called on well-formed 'looprange' clause after parsing its arguments.
+ OMPClause *
+ ActOnOpenMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc);
/// Called on well-formed 'ordered' clause.
OMPClause *
ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc,
@@ -1485,7 +1498,81 @@ private:
bool checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits);
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *>> &OriginalInits);
+
+ /// Holds the result of the analysis of a (possibly canonical) loop.
+ struct LoopAnalysis {
+ /// The analyzed loop or loop transformation.
+ Stmt *AStmt = nullptr;
+ /// Loop analyses results.
+ OMPLoopBasedDirective::HelperExprs HelperExprs;
+ /// The for-statement of the loop. TheForStmt equals AStmt only when the
+ /// latter is a canonical loop (i.e. not a loop transformation).
+ Stmt *TheForStmt = nullptr;
+ /// Initialization statements before transformations.
+ SmallVector<Stmt *> OriginalInits;
+ /// Initialization statements required after transformation of this loop.
+ SmallVector<Stmt *> TransformsPreInits;
+
+ explicit LoopAnalysis(Stmt *S) : AStmt(S) {}
+
+ bool isRegularLoop() const { return isRegularLoop(AStmt); }
+ bool isLoopTransformation() const { return isLoopTransformation(AStmt); }
+
+ // Convenience functions used when building LoopSequenceAnalysis.
+ static bool isRegularLoop(Stmt *S) {
+ return isa<ForStmt, CXXForRangeStmt>(S);
+ }
+ static bool isLoopTransformation(Stmt *S) {
+ return isa<OMPLoopTransformationDirective>(S);
+ }
+ };
+
+ /// Holds the result of the analysis of a (possibly canonical) loop sequence.
+ struct LoopSequenceAnalysis {
+ /// Number of top level canonical loops.
+ unsigned LoopSeqSize = 0;
+ /// For each loop results of the analysis.
+ SmallVector<LoopAnalysis, 2> Loops;
+ /// Additional code required before entering the transformed loop sequence.
+ SmallVector<Stmt *> LoopSequencePreInits;
+
+ // Convenience function used when building the LoopSequenceAnalysis.
+ static bool isLoopSequenceDerivation(Stmt *S) {
+ return LoopAnalysis::isRegularLoop(S) ||
+ LoopAnalysis::isLoopTransformation(S);
+ }
+ };
+
+ /// The main recursive process of `checkTransformableLoopSequence` that
+ /// performs grammatical parsing of a canonical loop sequence. It extracts
+ /// key information, such as the number of top-level loops, loop statements,
+ /// helper expressions, and other relevant loop-related data, all in a single
+ /// pass to avoid redundant traversals. This analysis flattens inner
+ /// loop sequences.
+ ///
+ /// \param LoopSeqStmt The AST of the original statement.
+ /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt
+ /// \param Context The AST context
+ /// \param Kind The loop transformation directive kind.
+ /// \return Whether the original statement is both syntactically and
+ /// semantically correct according to OpenMP 6.0 canonical loop
+ /// sequence definition.
+ bool analyzeLoopSequence(Stmt *LoopSeqStmt, LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context, OpenMPDirectiveKind Kind);
+
+ /// Validates and checks whether a loop sequence can be transformed according
+ /// to the given directive, providing necessary setup and initialization
+ /// (Driver function) before recursion using `analyzeLoopSequence`.
+ ///
+ /// \param Kind The loop transformation directive kind.
+ /// \param AStmt The AST of the original statement
+ /// \param SeqAnalysis [out] Result of the analysis of \p AStmt
+ /// \param Context The AST context
+ /// \return Whether the analysis completed without errors
+ bool checkTransformableLoopSequence(OpenMPDirectiveKind Kind, Stmt *AStmt,
+ LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context);
/// Helper to keep information about the current `omp begin/end declare
/// variant` nesting.
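To make the analysis concrete: for a sequence like the following (illustrative), analyzeLoopSequence would record LoopSeqSize == 2, with the second LoopAnalysis entry flagged as a loop transformation rather than a regular loop:

  #pragma omp fuse
  {
    for (int i = 0; i < n; ++i)
      a[i] += 1.0;
    #pragma omp unroll partial(4) // loop-generating construct inside the
    for (int j = 0; j < n; ++j)   // sequence; its generated loop is fused
      b[j] *= 2.0;
  }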
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 441047d..99864c7 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1951,6 +1951,7 @@ enum StmtCode {
STMT_OMP_UNROLL_DIRECTIVE,
STMT_OMP_REVERSE_DIRECTIVE,
STMT_OMP_INTERCHANGE_DIRECTIVE,
+ STMT_OMP_FUSE_DIRECTIVE,
STMT_OMP_FOR_DIRECTIVE,
STMT_OMP_FOR_SIMD_DIRECTIVE,
STMT_OMP_SECTIONS_DIRECTIVE,
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 61dd330..0fd0e7e 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -12590,6 +12590,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
Type = Context.AMDGPUBufferRsrcTy;
break;
}
+ case 't': {
+ Type = Context.AMDGPUTextureTy;
+ break;
+ }
default:
llvm_unreachable("Unexpected target builtin type");
}
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index b4da999..0b7b6cd 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -2934,8 +2934,9 @@ bool Compiler<Emitter>::VisitMaterializeTemporaryExpr(
// For everyhing else, use local variables.
if (SubExprT) {
bool IsConst = SubExpr->getType().isConstQualified();
- unsigned LocalIndex =
- allocateLocalPrimitive(E, *SubExprT, IsConst, E->getExtendingDecl());
+ bool IsVolatile = SubExpr->getType().isVolatileQualified();
+ unsigned LocalIndex = allocateLocalPrimitive(
+ E, *SubExprT, IsConst, IsVolatile, E->getExtendingDecl());
if (!this->visit(SubExpr))
return false;
if (!this->emitSetLocal(*SubExprT, LocalIndex, E))
@@ -4452,6 +4453,9 @@ bool Compiler<Emitter>::visitAssignment(const Expr *LHS, const Expr *RHS,
if (!this->visit(LHS))
return false;
+ if (LHS->getType().isVolatileQualified())
+ return this->emitInvalidStore(LHS->getType().getTypePtr(), E);
+
// We don't support assignments in C.
if (!Ctx.getLangOpts().CPlusPlus && !this->emitInvalid(E))
return false;
@@ -4560,13 +4564,14 @@ bool Compiler<Emitter>::emitConst(const APSInt &Value, const Expr *E) {
template <class Emitter>
unsigned Compiler<Emitter>::allocateLocalPrimitive(
- DeclTy &&Src, PrimType Ty, bool IsConst, const ValueDecl *ExtendingDecl,
- ScopeKind SC, bool IsConstexprUnknown) {
+ DeclTy &&Src, PrimType Ty, bool IsConst, bool IsVolatile,
+ const ValueDecl *ExtendingDecl, ScopeKind SC, bool IsConstexprUnknown) {
// FIXME: There are cases where Src.is<Expr*>() is wrong, e.g.
// (int){12} in C. Consider using Expr::isTemporaryObject() instead
// or isa<MaterializeTemporaryExpr>().
Descriptor *D = P.createDescriptor(Src, Ty, nullptr, Descriptor::InlineDescMD,
- IsConst, isa<const Expr *>(Src));
+ IsConst, isa<const Expr *>(Src),
+ /*IsMutable=*/false, IsVolatile);
D->IsConstexprUnknown = IsConstexprUnknown;
Scope::Local Local = this->createLocal(D);
if (auto *VD = dyn_cast_if_present<ValueDecl>(Src.dyn_cast<const Decl *>()))
@@ -4874,7 +4879,8 @@ Compiler<Emitter>::visitVarDecl(const VarDecl *VD, const Expr *Init,
if (VarT) {
unsigned Offset = this->allocateLocalPrimitive(
- VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block,
+ VD, *VarT, VD->getType().isConstQualified(),
+ VD->getType().isVolatileQualified(), nullptr, ScopeKind::Block,
IsConstexprUnknown);
if (Init) {
// If this is a toplevel declaration, create a scope for the
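The effect of threading IsVolatile through allocateLocalPrimitive is that the bytecode interpreter can now reject volatile accesses during constant evaluation. A sketch of the kind of code this diagnoses (illustrative; see the CheckStore/InvalidStore changes below):

  constexpr int f() {
    volatile int x = 0; // the local's descriptor now carries the volatile bit
    x = 1;              // rejected: assignment to volatile in a constant
                        // expression
    return 0;
  }
  // constexpr int y = f(); // would now fail to compile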
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index 09599b3..5c46f75 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -327,6 +327,7 @@ protected:
/// Creates a local primitive value.
unsigned allocateLocalPrimitive(DeclTy &&Decl, PrimType Ty, bool IsConst,
+ bool IsVolatile = false,
const ValueDecl *ExtendingDecl = nullptr,
ScopeKind SC = ScopeKind::Block,
bool IsConstexprUnknown = false);
diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp
index 306f95c..683e916 100644
--- a/clang/lib/AST/ByteCode/Context.cpp
+++ b/clang/lib/AST/ByteCode/Context.cpp
@@ -567,9 +567,15 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FuncDecl) {
// Assign descriptors to all parameters.
// Composite objects are lowered to pointers.
for (const ParmVarDecl *PD : FuncDecl->parameters()) {
+ bool IsConst = PD->getType().isConstQualified();
+ bool IsVolatile = PD->getType().isVolatileQualified();
+
OptPrimType T = classify(PD->getType());
PrimType PT = T.value_or(PT_Ptr);
- Descriptor *Desc = P->createDescriptor(PD, PT);
+ Descriptor *Desc = P->createDescriptor(PD, PT, nullptr, std::nullopt,
+ IsConst, /*IsTemporary=*/false,
+ /*IsMutable=*/false, IsVolatile);
+
ParamDescriptors.insert({ParamOffset, {PT, Desc}});
ParamOffsets.push_back(ParamOffset);
ParamOffset += align(primSize(PT));
@@ -595,9 +601,14 @@ const Function *Context::getOrCreateObjCBlock(const BlockExpr *E) {
// Assign descriptors to all parameters.
// Composite objects are lowered to pointers.
for (const ParmVarDecl *PD : BD->parameters()) {
+ bool IsConst = PD->getType().isConstQualified();
+ bool IsVolatile = PD->getType().isVolatileQualified();
+
OptPrimType T = classify(PD->getType());
PrimType PT = T.value_or(PT_Ptr);
- Descriptor *Desc = P->createDescriptor(PD, PT);
+ Descriptor *Desc = P->createDescriptor(PD, PT, nullptr, std::nullopt,
+ IsConst, /*IsTemporary=*/false,
+ /*IsMutable=*/false, IsVolatile);
ParamDescriptors.insert({ParamOffset, {PT, Desc}});
ParamOffsets.push_back(ParamOffset);
ParamOffset += align(primSize(PT));
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 8aaefc7..21af3d6 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -889,6 +889,8 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
return false;
if (!CheckConst(S, OpPC, Ptr))
return false;
+ if (!CheckVolatile(S, OpPC, Ptr, AK_Assign))
+ return false;
if (!S.inConstantContext() && isConstexprUnknown(Ptr))
return false;
return true;
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 7867a06..bb0c458 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -1730,9 +1730,8 @@ inline bool GetPtrLocal(InterpState &S, CodePtr OpPC, uint32_t I) {
}
inline bool GetPtrParam(InterpState &S, CodePtr OpPC, uint32_t I) {
- if (S.checkingPotentialConstantExpression()) {
+ if (S.Current->isBottomFrame())
return false;
- }
S.Stk.push<Pointer>(S.Current->getParamPointer(I));
return true;
}
@@ -3344,6 +3343,18 @@ inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind,
return false;
}
+inline bool InvalidStore(InterpState &S, CodePtr OpPC, const Type *T) {
+ if (S.getLangOpts().CPlusPlus) {
+ QualType VolatileType = QualType(T, 0).withVolatile();
+ S.FFDiag(S.Current->getSource(OpPC),
+ diag::note_constexpr_access_volatile_type)
+ << AK_Assign << VolatileType;
+ } else {
+ S.FFDiag(S.Current->getSource(OpPC));
+ }
+ return false;
+}
+
inline bool InvalidDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR,
bool InitializerFailed) {
assert(DR);
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 891344d..a2e97fc 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -1294,95 +1294,6 @@ static bool interp__builtin_assume_aligned(InterpState &S, CodePtr OpPC,
return true;
}
-static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call) {
- if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() ||
- !Call->getArg(1)->getType()->isIntegerType())
- return false;
-
- APSInt Index = popToAPSInt(S, Call->getArg(1));
- APSInt Val = popToAPSInt(S, Call->getArg(0));
-
- unsigned BitWidth = Val.getBitWidth();
- uint64_t Shift = Index.extractBitsAsZExtValue(8, 0);
- uint64_t Length = Index.extractBitsAsZExtValue(8, 8);
- Length = Length > BitWidth ? BitWidth : Length;
-
- // Handle out of bounds cases.
- if (Length == 0 || Shift >= BitWidth) {
- pushInteger(S, 0, Call->getType());
- return true;
- }
-
- uint64_t Result = Val.getZExtValue() >> Shift;
- Result &= llvm::maskTrailingOnes<uint64_t>(Length);
- pushInteger(S, Result, Call->getType());
- return true;
-}
-
-static bool interp__builtin_ia32_bzhi(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call) {
- QualType CallType = Call->getType();
- if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() ||
- !Call->getArg(1)->getType()->isIntegerType() ||
- !CallType->isIntegerType())
- return false;
-
- APSInt Idx = popToAPSInt(S, Call->getArg(1));
- APSInt Val = popToAPSInt(S, Call->getArg(0));
-
- unsigned BitWidth = Val.getBitWidth();
- uint64_t Index = Idx.extractBitsAsZExtValue(8, 0);
-
- if (Index < BitWidth)
- Val.clearHighBits(BitWidth - Index);
-
- pushInteger(S, Val, CallType);
- return true;
-}
-
-static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call) {
- if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() ||
- !Call->getArg(1)->getType()->isIntegerType())
- return false;
-
- APSInt Mask = popToAPSInt(S, Call->getArg(1));
- APSInt Val = popToAPSInt(S, Call->getArg(0));
-
- unsigned BitWidth = Val.getBitWidth();
- APInt Result = APInt::getZero(BitWidth);
- for (unsigned I = 0, P = 0; I != BitWidth; ++I) {
- if (Mask[I])
- Result.setBitVal(I, Val[P++]);
- }
- pushInteger(S, std::move(Result), Call->getType());
- return true;
-}
-
-static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call) {
- if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() ||
- !Call->getArg(1)->getType()->isIntegerType())
- return false;
-
- APSInt Mask = popToAPSInt(S, Call->getArg(1));
- APSInt Val = popToAPSInt(S, Call->getArg(0));
-
- unsigned BitWidth = Val.getBitWidth();
- APInt Result = APInt::getZero(BitWidth);
- for (unsigned I = 0, P = 0; I != BitWidth; ++I) {
- if (Mask[I])
- Result.setBitVal(P++, Val[I]);
- }
- pushInteger(S, std::move(Result), Call->getType());
- return true;
-}
-
/// (CarryIn, LHS, RHS, Result)
static bool interp__builtin_ia32_addcarry_subborrow(InterpState &S,
CodePtr OpPC,
@@ -3275,11 +3186,37 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_bextr_u64:
case clang::X86::BI__builtin_ia32_bextri_u32:
case clang::X86::BI__builtin_ia32_bextri_u64:
- return interp__builtin_ia32_bextr(S, OpPC, Frame, Call);
+ return interp__builtin_elementwise_int_binop(
+ S, OpPC, Call, [](const APSInt &Val, const APSInt &Idx) {
+ unsigned BitWidth = Val.getBitWidth();
+ uint64_t Shift = Idx.extractBitsAsZExtValue(8, 0);
+ uint64_t Length = Idx.extractBitsAsZExtValue(8, 8);
+ if (Length > BitWidth) {
+ Length = BitWidth;
+ }
+
+ // Handle out of bounds cases.
+ if (Length == 0 || Shift >= BitWidth)
+ return APInt(BitWidth, 0);
+
+ uint64_t Result = Val.getZExtValue() >> Shift;
+ Result &= llvm::maskTrailingOnes<uint64_t>(Length);
+ return APInt(BitWidth, Result);
+ });
case clang::X86::BI__builtin_ia32_bzhi_si:
case clang::X86::BI__builtin_ia32_bzhi_di:
- return interp__builtin_ia32_bzhi(S, OpPC, Frame, Call);
+ return interp__builtin_elementwise_int_binop(
+ S, OpPC, Call, [](const APSInt &Val, const APSInt &Idx) {
+ unsigned BitWidth = Val.getBitWidth();
+ uint64_t Index = Idx.extractBitsAsZExtValue(8, 0);
+ APSInt Result = Val;
+
+ if (Index < BitWidth)
+ Result.clearHighBits(BitWidth - Index);
+
+ return Result;
+ });
case clang::X86::BI__builtin_ia32_lzcnt_u16:
case clang::X86::BI__builtin_ia32_lzcnt_u32:
@@ -3299,11 +3236,33 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case clang::X86::BI__builtin_ia32_pdep_si:
case clang::X86::BI__builtin_ia32_pdep_di:
- return interp__builtin_ia32_pdep(S, OpPC, Frame, Call);
+ return interp__builtin_elementwise_int_binop(
+ S, OpPC, Call, [](const APSInt &Val, const APSInt &Mask) {
+ unsigned BitWidth = Val.getBitWidth();
+ APInt Result = APInt::getZero(BitWidth);
+
+ for (unsigned I = 0, P = 0; I != BitWidth; ++I) {
+ if (Mask[I])
+ Result.setBitVal(I, Val[P++]);
+ }
+
+ return Result;
+ });
case clang::X86::BI__builtin_ia32_pext_si:
case clang::X86::BI__builtin_ia32_pext_di:
- return interp__builtin_ia32_pext(S, OpPC, Frame, Call);
+ return interp__builtin_elementwise_int_binop(
+ S, OpPC, Call, [](const APSInt &Val, const APSInt &Mask) {
+ unsigned BitWidth = Val.getBitWidth();
+ APInt Result = APInt::getZero(BitWidth);
+
+ for (unsigned I = 0, P = 0; I != BitWidth; ++I) {
+ if (Mask[I])
+ Result.setBitVal(P++, Val[I]);
+ }
+
+ return Result;
+ });
case clang::X86::BI__builtin_ia32_addcarryx_u32:
case clang::X86::BI__builtin_ia32_addcarryx_u64:
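Since these builtins now fold through the generic elementwise binop helper instead of the dedicated functions removed above, they stay usable in constant expressions. Worked examples of the semantics (illustrative; compiling them would require BMI/BMI2 to be enabled):

  // bextr(val, ctrl): start = ctrl[7:0], length = ctrl[15:8]
  static_assert(__builtin_ia32_bextr_u32(0x12345678, 0x0804) == 0x67, "");
  // bzhi(val, idx): zero all bits at positions >= idx
  //   e.g. bzhi(0xFF, 4) == 0x0F
  // pdep(val, mask): deposit the low bits of val at the set positions of mask
  //   e.g. pdep(0b101, 0b11010) == 0b10010
  // pext(val, mask): gather the bits of val selected by mask into the low bits
  //   e.g. pext(0b10010, 0b11010) == 0b101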
diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp
index a3db0d7..039acb5 100644
--- a/clang/lib/AST/ByteCode/InterpFrame.cpp
+++ b/clang/lib/AST/ByteCode/InterpFrame.cpp
@@ -231,6 +231,8 @@ Pointer InterpFrame::getParamPointer(unsigned Off) {
if (auto Pt = Params.find(Off); Pt != Params.end())
return Pointer(reinterpret_cast<Block *>(Pt->second.get()));
+ assert(!isBottomFrame());
+
// Allocate memory to store the parameter and the block metadata.
const auto &Desc = Func->getParamDescriptor(Off);
size_t BlockSize = sizeof(Block) + Desc.second->getAllocSize();
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 7af2df5..532c444 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -797,6 +797,7 @@ def SideEffect : Opcode {}
def InvalidCast : Opcode {
let Args = [ArgCastKind, ArgBool];
}
+def InvalidStore : Opcode { let Args = [ArgTypePtr]; }
def CheckPseudoDtor : Opcode {}
def InvalidDeclRef : Opcode {
diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h
index af89b66..cd738ce 100644
--- a/clang/lib/AST/ByteCode/Pointer.h
+++ b/clang/lib/AST/ByteCode/Pointer.h
@@ -262,6 +262,7 @@ public:
case Storage::Typeid:
return false;
}
+ llvm_unreachable("Unknown clang::interp::Storage enum");
}
/// Checks if the pointer is live.
bool isLive() const {
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 55b93e1..2ce4419 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -1024,6 +1024,26 @@ OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) {
return new (C) OMPPartialClause();
}
+OMPLoopRangeClause *
+OMPLoopRangeClause::Create(const ASTContext &C, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc,
+ Expr *First, Expr *Count) {
+ OMPLoopRangeClause *Clause = CreateEmpty(C);
+ Clause->setLocStart(StartLoc);
+ Clause->setLParenLoc(LParenLoc);
+ Clause->setFirstLoc(FirstLoc);
+ Clause->setCountLoc(CountLoc);
+ Clause->setLocEnd(EndLoc);
+ Clause->setFirst(First);
+ Clause->setCount(Count);
+ return Clause;
+}
+
+OMPLoopRangeClause *OMPLoopRangeClause::CreateEmpty(const ASTContext &C) {
+ return new (C) OMPLoopRangeClause();
+}
+
OMPAllocateClause *OMPAllocateClause::Create(
const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc,
@@ -1964,6 +1984,21 @@ void OMPClausePrinter::VisitOMPPartialClause(OMPPartialClause *Node) {
}
}
+void OMPClausePrinter::VisitOMPLoopRangeClause(OMPLoopRangeClause *Node) {
+ OS << "looprange";
+
+ Expr *First = Node->getFirst();
+ Expr *Count = Node->getCount();
+
+ if (First && Count) {
+ OS << "(";
+ First->printPretty(OS, nullptr, Policy, 0);
+ OS << ",";
+ Count->printPretty(OS, nullptr, Policy, 0);
+ OS << ")";
+ }
+}
+
void OMPClausePrinter::VisitOMPAllocatorClause(OMPAllocatorClause *Node) {
OS << "allocator(";
Node->getAllocator()->printPretty(OS, nullptr, Policy, 0);
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 1f6586f..a5b0cd3 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -125,13 +125,12 @@ OMPLoopBasedDirective::tryToFindNextInnerLoop(Stmt *CurStmt,
bool OMPLoopBasedDirective::doForAllLoops(
Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops,
llvm::function_ref<bool(unsigned, Stmt *)> Callback,
- llvm::function_ref<void(OMPCanonicalLoopNestTransformationDirective *)>
+ llvm::function_ref<void(OMPLoopTransformationDirective *)>
OnTransformationCallback) {
CurStmt = CurStmt->IgnoreContainers();
for (unsigned Cnt = 0; Cnt < NumLoops; ++Cnt) {
while (true) {
- auto *Dir =
- dyn_cast<OMPCanonicalLoopNestTransformationDirective>(CurStmt);
+ auto *Dir = dyn_cast<OMPLoopTransformationDirective>(CurStmt);
if (!Dir)
break;
@@ -371,6 +370,22 @@ OMPForDirective *OMPForDirective::Create(
return Dir;
}
+Stmt *OMPLoopTransformationDirective::getTransformedStmt() const {
+ if (auto *D = dyn_cast<OMPCanonicalLoopNestTransformationDirective>(S))
+ return D->getTransformedStmt();
+ if (auto *D = dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S))
+ return D->getTransformedStmt();
+ llvm_unreachable("unexpected object type");
+}
+
+Stmt *OMPLoopTransformationDirective::getPreInits() const {
+ if (auto *D = dyn_cast<OMPCanonicalLoopNestTransformationDirective>(S))
+ return D->getPreInits();
+ if (auto *D = dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S))
+ return D->getPreInits();
+ llvm_unreachable("unexpected object type");
+}
+
Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const {
switch (getStmtClass()) {
#define STMT(CLASS, PARENT)
@@ -380,7 +395,7 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const {
return static_cast<const CLASS *>(this)->getTransformedStmt();
#include "clang/AST/StmtNodes.inc"
default:
- llvm_unreachable("Not a loop transformation");
+ llvm_unreachable("Not a loop transformation for canonical loop nests");
}
}
@@ -393,7 +408,34 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getPreInits() const {
return static_cast<const CLASS *>(this)->getPreInits();
#include "clang/AST/StmtNodes.inc"
default:
- llvm_unreachable("Not a loop transformation");
+ llvm_unreachable("Not a loop transformation for canonical loop nests");
+ }
+}
+
+Stmt *
+OMPCanonicalLoopSequenceTransformationDirective::getTransformedStmt() const {
+ switch (getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ return static_cast<const CLASS *>(this)->getTransformedStmt();
+#include "clang/AST/StmtNodes.inc"
+ default:
+ llvm_unreachable("Not a loop transformation for canonical loop sequences");
+ }
+}
+
+Stmt *OMPCanonicalLoopSequenceTransformationDirective::getPreInits() const {
+ switch (getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ return static_cast<const CLASS *>(this)->getPreInits();
+#include "clang/AST/StmtNodes.inc"
+ default:
+ llvm_unreachable("Not a loop transformation for canonical loop sequences");
}
}
@@ -510,6 +552,27 @@ OMPInterchangeDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses,
SourceLocation(), SourceLocation(), NumLoops);
}
+OMPFuseDirective *OMPFuseDirective::Create(
+ const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+ ArrayRef<OMPClause *> Clauses, unsigned NumGeneratedTopLevelLoops,
+ Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits) {
+
+ OMPFuseDirective *Dir = createDirective<OMPFuseDirective>(
+ C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc);
+ Dir->setTransformedStmt(TransformedStmt);
+ Dir->setPreInits(PreInits);
+ Dir->setNumGeneratedTopLevelLoops(NumGeneratedTopLevelLoops);
+ return Dir;
+}
+
+OMPFuseDirective *OMPFuseDirective::CreateEmpty(const ASTContext &C,
+ unsigned NumClauses) {
+ OMPFuseDirective *Dir = createEmptyDirective<OMPFuseDirective>(
+ C, NumClauses, /*HasAssociatedStmt=*/true, TransformedStmtOffset + 1,
+ SourceLocation(), SourceLocation());
+ return Dir;
+}
+
OMPForSimdDirective *
OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc,
SourceLocation EndLoc, unsigned CollapsedNum,
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 2c9c358..586c300 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -795,6 +795,11 @@ void StmtPrinter::VisitOMPInterchangeDirective(OMPInterchangeDirective *Node) {
PrintOMPExecutableDirective(Node);
}
+void StmtPrinter::VisitOMPFuseDirective(OMPFuseDirective *Node) {
+ Indent() << "#pragma omp fuse";
+ PrintOMPExecutableDirective(Node);
+}
+
void StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) {
Indent() << "#pragma omp for";
PrintOMPExecutableDirective(Node);
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 8b3af94..589a156 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -510,6 +510,13 @@ void OMPClauseProfiler::VisitOMPPartialClause(const OMPPartialClause *C) {
Profiler->VisitExpr(Factor);
}
+void OMPClauseProfiler::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) {
+ if (const Expr *First = C->getFirst())
+ Profiler->VisitExpr(First);
+ if (const Expr *Count = C->getCount())
+ Profiler->VisitExpr(Count);
+}
+
void OMPClauseProfiler::VisitOMPAllocatorClause(const OMPAllocatorClause *C) {
if (C->getAllocator())
Profiler->VisitStmt(C->getAllocator());
@@ -1025,6 +1032,15 @@ void StmtProfiler::VisitOMPInterchangeDirective(
VisitOMPCanonicalLoopNestTransformationDirective(S);
}
+void StmtProfiler::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *S) {
+ VisitOMPExecutableDirective(S);
+}
+
+void StmtProfiler::VisitOMPFuseDirective(const OMPFuseDirective *S) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(S);
+}
+
void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) {
VisitOMPLoopDirective(S);
}
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 387026e..64b2bff 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -282,6 +282,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
case OMPC_affinity:
case OMPC_when:
case OMPC_append_args:
+ case OMPC_looprange:
break;
default:
break;
@@ -627,6 +628,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
case OMPC_affinity:
case OMPC_when:
case OMPC_append_args:
+ case OMPC_looprange:
break;
default:
break;
@@ -755,9 +757,14 @@ bool clang::isOpenMPCanonicalLoopNestTransformationDirective(
DKind == OMPD_interchange || DKind == OMPD_stripe;
}
+bool clang::isOpenMPCanonicalLoopSequenceTransformationDirective(
+ OpenMPDirectiveKind DKind) {
+ return DKind == OMPD_fuse;
+}
+
bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) {
- // FIXME: There will be more cases when we implement 'fuse'.
- return isOpenMPCanonicalLoopNestTransformationDirective(DKind);
+ return isOpenMPCanonicalLoopNestTransformationDirective(DKind) ||
+ isOpenMPCanonicalLoopSequenceTransformationDirective(DKind);
}
bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) {
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 72ee09d..f4d7c12 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -18,6 +18,7 @@
#include "clang/Basic/LangOptions.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/TargetParser/TargetParser.h"
#include <cstdlib>
@@ -1042,3 +1043,51 @@ void TargetInfo::copyAuxTarget(const TargetInfo *Aux) {
auto *Src = static_cast<const TransferrableTargetInfo*>(Aux);
*Target = *Src;
}
+
+std::string
+TargetInfo::simplifyConstraint(StringRef Constraint,
+ SmallVectorImpl<ConstraintInfo> *OutCons) const {
+ std::string Result;
+
+ for (const char *I = Constraint.begin(), *E = Constraint.end(); I < E; I++) {
+ switch (*I) {
+ default:
+ Result += convertConstraint(I);
+ break;
+ // Ignore these
+ case '*':
+ case '?':
+ case '!':
+ case '=': // Will see this and the following in multi-alt constraints.
+ case '+':
+ break;
+ case '#': // Ignore the rest of the constraint alternative.
+ while (I + 1 != E && I[1] != ',')
+ I++;
+ break;
+ case '&':
+ case '%':
+ Result += *I;
+ while (I + 1 != E && I[1] == *I)
+ I++;
+ break;
+ case ',':
+ Result += "|";
+ break;
+ case 'g':
+ Result += "imr";
+ break;
+ case '[': {
+ assert(OutCons &&
+ "Must pass output names to constraints with a symbolic name");
+ unsigned Index;
+ bool ResolveResult = resolveSymbolicName(I, *OutCons, Index);
+ assert(ResolveResult && "Could not resolve symbolic name");
+ (void)ResolveResult;
+ Result += llvm::utostr(Index);
+ break;
+ }
+ }
+ }
+ return Result;
+}
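For reference, a few illustrative input/output pairs for the hoisted helper (assuming a typical target where convertConstraint returns the character unchanged):

  // TI.simplifyConstraint("g")     -> "imr"   'g' expands to imm/mem/reg
  // TI.simplifyConstraint("=&r,m") -> "&r|m"  '='/'+' dropped, ',' -> '|'
  // TI.simplifyConstraint("%0,r")  -> "%0|r"  '%' kept, alternatives joined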
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index cb8fe6c..9d12a13 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -951,28 +951,37 @@ Address CIRGenFunction::getAddressOfBaseClass(
bool nullCheckValue, SourceLocation loc) {
assert(!path.empty() && "Base path should not be empty!");
+ CastExpr::path_const_iterator start = path.begin();
+ const CXXRecordDecl *vBase = nullptr;
+
if ((*path.begin())->isVirtual()) {
- // The implementation here is actually complete, but let's flag this
- // as an error until the rest of the virtual base class support is in place.
- cgm.errorNYI(loc, "getAddrOfBaseClass: virtual base");
- return Address::invalid();
+ vBase = (*start)->getType()->castAsCXXRecordDecl();
+ ++start;
}
// Compute the static offset of the ultimate destination within its
// allocating subobject (the virtual base, if there is one, or else
// the "complete" object that we see).
- CharUnits nonVirtualOffset =
- cgm.computeNonVirtualBaseClassOffset(derived, path);
+ CharUnits nonVirtualOffset = cgm.computeNonVirtualBaseClassOffset(
+ vBase ? vBase : derived, {start, path.end()});
+
+ // If there's a virtual step, we can sometimes "devirtualize" it.
+ // For now, that's limited to when the derived type is final.
+ // TODO: "devirtualize" this for accesses to known-complete objects.
+ if (vBase && derived->hasAttr<FinalAttr>()) {
+ const ASTRecordLayout &layout = getContext().getASTRecordLayout(derived);
+ CharUnits vBaseOffset = layout.getVBaseClassOffset(vBase);
+ nonVirtualOffset += vBaseOffset;
+ vBase = nullptr; // we no longer have a virtual step
+ }
// Get the base pointer type.
mlir::Type baseValueTy = convertType((path.end()[-1])->getType());
assert(!cir::MissingFeatures::addressSpace());
- // The if statement here is redundant now, but it will be needed when we add
- // support for virtual base classes.
// If there is no virtual base, use cir.base_class_addr. It takes care of
// the adjustment and the null pointer check.
- if (nonVirtualOffset.isZero()) {
+ if (nonVirtualOffset.isZero() && !vBase) {
assert(!cir::MissingFeatures::sanitizers());
return builder.createBaseClassAddr(getLoc(loc), value, baseValueTy, 0,
/*assumeNotNull=*/true);
@@ -980,10 +989,17 @@ Address CIRGenFunction::getAddressOfBaseClass(
assert(!cir::MissingFeatures::sanitizers());
- // Apply the offset
- value = builder.createBaseClassAddr(getLoc(loc), value, baseValueTy,
- nonVirtualOffset.getQuantity(),
- /*assumeNotNull=*/true);
+ // Compute the virtual offset.
+ mlir::Value virtualOffset = nullptr;
+ if (vBase) {
+ virtualOffset = cgm.getCXXABI().getVirtualBaseClassOffset(
+ getLoc(loc), *this, value, derived, vBase);
+ }
+
+ // Apply both offsets.
+ value = applyNonVirtualAndVirtualOffset(
+ getLoc(loc), *this, value, nonVirtualOffset, virtualOffset, derived,
+ vBase, baseValueTy, !nullCheckValue);
// Cast to the destination type.
value = value.withElementType(builder, baseValueTy);
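Illustrative C++ for the two newly supported paths (names are hypothetical): a cast through a virtual base now emits the vtable-loaded offset, and the final-class case folds the offset statically:

  struct B { int b; };
  struct D : virtual B { int d; };
  struct F final : virtual B { int f; };

  B *up(D *p) { return p; }      // virtual step: offset read via the vtable
  B *upFinal(F *p) { return p; } // devirtualized: F is final, offset is known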
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
index 1f7e3dd..83208bf 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
@@ -210,6 +210,60 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorCall(
return emitCall(fnInfo, callee, returnValue, args, nullptr, loc);
}
+namespace {
+/// The parameters to pass to a usual operator delete.
+struct UsualDeleteParams {
+ TypeAwareAllocationMode typeAwareDelete = TypeAwareAllocationMode::No;
+ bool destroyingDelete = false;
+ bool size = false;
+ AlignedAllocationMode alignment = AlignedAllocationMode::No;
+};
+} // namespace
+
+// FIXME(cir): this should be shared with LLVM codegen
+static UsualDeleteParams getUsualDeleteParams(const FunctionDecl *fd) {
+ UsualDeleteParams params;
+
+ const FunctionProtoType *fpt = fd->getType()->castAs<FunctionProtoType>();
+ auto ai = fpt->param_type_begin(), ae = fpt->param_type_end();
+
+ if (fd->isTypeAwareOperatorNewOrDelete()) {
+ params.typeAwareDelete = TypeAwareAllocationMode::Yes;
+ assert(ai != ae);
+ ++ai;
+ }
+
+ // The first argument after the type-identity parameter (if any) is
+ // always a void* (or C* for a destroying operator delete for class
+ // type C).
+ ++ai;
+
+ // The next parameter may be a std::destroying_delete_t.
+ if (fd->isDestroyingOperatorDelete()) {
+ params.destroyingDelete = true;
+ assert(ai != ae);
+ ++ai;
+ }
+
+ // Figure out what other parameters we should be implicitly passing.
+ if (ai != ae && (*ai)->isIntegerType()) {
+ params.size = true;
+ ++ai;
+ } else {
+ assert(!isTypeAwareAllocation(params.typeAwareDelete));
+ }
+
+ if (ai != ae && (*ai)->isAlignValT()) {
+ params.alignment = AlignedAllocationMode::Yes;
+ ++ai;
+ } else {
+ assert(!isTypeAwareAllocation(params.typeAwareDelete));
+ }
+
+ assert(ai == ae && "unexpected usual deallocation function parameter");
+ return params;
+}
+
static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e,
unsigned minElements,
mlir::Value &numElements,
@@ -332,6 +386,117 @@ static RValue emitNewDeleteCall(CIRGenFunction &cgf,
return rv;
}
+namespace {
+/// Calls the given 'operator delete' on a single object.
+struct CallObjectDelete final : EHScopeStack::Cleanup {
+ mlir::Value ptr;
+ const FunctionDecl *operatorDelete;
+ QualType elementType;
+
+ CallObjectDelete(mlir::Value ptr, const FunctionDecl *operatorDelete,
+ QualType elementType)
+ : ptr(ptr), operatorDelete(operatorDelete), elementType(elementType) {}
+
+ void emit(CIRGenFunction &cgf) override {
+ cgf.emitDeleteCall(operatorDelete, ptr, elementType);
+ }
+
+ // This is a placeholder until EHCleanupScope is implemented.
+ size_t getSize() const override {
+ assert(!cir::MissingFeatures::ehCleanupScope());
+ return sizeof(CallObjectDelete);
+ }
+};
+} // namespace
+
+/// Emit the code for deleting a single object.
+static void emitObjectDelete(CIRGenFunction &cgf, const CXXDeleteExpr *de,
+ Address ptr, QualType elementType) {
+ // C++11 [expr.delete]p3:
+ // If the static type of the object to be deleted is different from its
+ // dynamic type, the static type shall be a base class of the dynamic type
+ // of the object to be deleted and the static type shall have a virtual
+ // destructor or the behavior is undefined.
+ assert(!cir::MissingFeatures::emitTypeCheck());
+
+ const FunctionDecl *operatorDelete = de->getOperatorDelete();
+ assert(!operatorDelete->isDestroyingOperatorDelete());
+
+ // Find the destructor for the type, if applicable. If the
+ // destructor is virtual, we'll just emit the vcall and return.
+ const CXXDestructorDecl *dtor = nullptr;
+ if (const auto *rd = elementType->getAsCXXRecordDecl()) {
+ if (rd->hasDefinition() && !rd->hasTrivialDestructor()) {
+ dtor = rd->getDestructor();
+
+ if (dtor->isVirtual()) {
+ cgf.cgm.errorNYI(de->getSourceRange(),
+ "emitObjectDelete: virtual destructor");
+ }
+ }
+ }
+
+ // Make sure that we call delete even if the dtor throws.
+ // This doesn't have to be a conditional cleanup because we're going
+ // to pop it off in a second.
+ cgf.ehStack.pushCleanup<CallObjectDelete>(
+ NormalAndEHCleanup, ptr.getPointer(), operatorDelete, elementType);
+
+ if (dtor) {
+ cgf.emitCXXDestructorCall(dtor, Dtor_Complete,
+ /*ForVirtualBase=*/false,
+ /*Delegating=*/false, ptr, elementType);
+ } else if (elementType.getObjCLifetime()) {
+ assert(!cir::MissingFeatures::objCLifetime());
+ cgf.cgm.errorNYI(de->getSourceRange(), "emitObjectDelete: ObjCLifetime");
+ }
+
+ // In traditional LLVM codegen, null checks are emitted to save a delete
+ // call. In CIR we optimize for size by default, so the null check should be
+ // added by this function's callers.
+ assert(!cir::MissingFeatures::emitNullCheckForDeleteCalls());
+
+ cgf.popCleanupBlock();
+}
+
+void CIRGenFunction::emitCXXDeleteExpr(const CXXDeleteExpr *e) {
+ const Expr *arg = e->getArgument();
+ Address ptr = emitPointerWithAlignment(arg);
+
+ // Null check the pointer.
+ //
+ // We could avoid this null check if we can determine that the object
+ // destruction is trivial and doesn't require an array cookie; we can
+ // unconditionally perform the operator delete call in that case. For now, we
+ // assume that deleted pointers are null rarely enough that it's better to
+ // keep the branch. This might be worth revisiting for a -O0 code size win.
+ //
+ // CIR note: for now we default to the code-size-friendly form, as mentioned
+ // in `emitObjectDelete`.
+ assert(!cir::MissingFeatures::emitNullCheckForDeleteCalls());
+ QualType deleteTy = e->getDestroyedType();
+
+ // A destroying operator delete overrides the entire operation of the
+ // delete expression.
+ if (e->getOperatorDelete()->isDestroyingOperatorDelete()) {
+ cgm.errorNYI(e->getSourceRange(),
+ "emitCXXDeleteExpr: destroying operator delete");
+ return;
+ }
+
+ // We might be deleting a pointer to array.
+ deleteTy = getContext().getBaseElementType(deleteTy);
+ ptr = ptr.withElementType(builder, convertTypeForMem(deleteTy));
+
+ if (e->isArrayForm()) {
+ assert(!cir::MissingFeatures::deleteArray());
+ cgm.errorNYI(e->getSourceRange(), "emitCXXDeleteExpr: array delete");
+ return;
+ } else {
+ emitObjectDelete(*this, e, ptr, deleteTy);
+ }
+}
+
mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) {
// The element type being allocated.
QualType allocType = getContext().getBaseElementType(e->getAllocatedType());
@@ -443,3 +608,53 @@ mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) {
allocSizeWithoutCookie);
return result.getPointer();
}
+
+void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD,
+ mlir::Value ptr, QualType deleteTy) {
+ assert(!cir::MissingFeatures::deleteArray());
+
+ const auto *deleteFTy = deleteFD->getType()->castAs<FunctionProtoType>();
+ CallArgList deleteArgs;
+
+ UsualDeleteParams params = getUsualDeleteParams(deleteFD);
+ auto paramTypeIt = deleteFTy->param_type_begin();
+
+ // Pass std::type_identity tag if present
+ if (isTypeAwareAllocation(params.typeAwareDelete))
+ cgm.errorNYI(deleteFD->getSourceRange(),
+ "emitDeleteCall: type aware delete");
+
+ // Pass the pointer itself.
+ QualType argTy = *paramTypeIt++;
+ mlir::Value deletePtr =
+ builder.createBitcast(ptr.getLoc(), ptr, convertType(argTy));
+ deleteArgs.add(RValue::get(deletePtr), argTy);
+
+ // Pass the std::destroying_delete tag if present.
+ if (params.destroyingDelete)
+ cgm.errorNYI(deleteFD->getSourceRange(),
+ "emitDeleteCall: destroying delete");
+
+ // Pass the size if the delete function has a size_t parameter.
+ if (params.size) {
+ QualType sizeType = *paramTypeIt++;
+ CharUnits deleteTypeSize = getContext().getTypeSizeInChars(deleteTy);
+ assert(mlir::isa<cir::IntType>(convertType(sizeType)) &&
+ "expected cir::IntType");
+ cir::ConstantOp size = builder.getConstInt(
+ *currSrcLoc, convertType(sizeType), deleteTypeSize.getQuantity());
+
+ deleteArgs.add(RValue::get(size), sizeType);
+ }
+
+ // Pass the alignment if the delete function has an align_val_t parameter.
+ if (isAlignedAllocation(params.alignment))
+ cgm.errorNYI(deleteFD->getSourceRange(),
+ "emitDeleteCall: aligned allocation");
+
+ assert(paramTypeIt == deleteFTy->param_type_end() &&
+ "unknown parameter to usual delete function");
+
+ // Emit the call to delete.
+ emitNewDeleteCall(*this, deleteFD, deleteFTy, deleteArgs);
+}
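Tying the pieces together, a simple delete expression now lowers end to end (sketch; assumes a non-virtual destructor and the usual sized operator delete):

  struct S {
    ~S(); // non-trivial, non-virtual
  };
  void destroy(S *p) {
    delete p; // emitObjectDelete: pushes the CallObjectDelete cleanup,
              // calls ~S(), then emitDeleteCall passes (void*, size_t)
  }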
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index bd09d78..f4bbced 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -676,6 +676,10 @@ public:
mlir::Value VisitRealImag(const UnaryOperator *e,
QualType promotionType = QualType());
+ mlir::Value VisitUnaryExtension(const UnaryOperator *e) {
+ return Visit(e->getSubExpr());
+ }
+
mlir::Value VisitCXXDefaultInitExpr(CXXDefaultInitExpr *die) {
CIRGenFunction::CXXDefaultInitExprScope scope(cgf, die);
return Visit(die->getExpr());
@@ -687,6 +691,10 @@ public:
mlir::Value VisitCXXNewExpr(const CXXNewExpr *e) {
return cgf.emitCXXNewExpr(e);
}
+ mlir::Value VisitCXXDeleteExpr(const CXXDeleteExpr *e) {
+ cgf.emitCXXDeleteExpr(e);
+ return {};
+ }
mlir::Value VisitCXXThrowExpr(const CXXThrowExpr *e) {
cgf.emitCXXThrowExpr(e);
@@ -1274,9 +1282,6 @@ mlir::Value ScalarExprEmitter::emitPromoted(const Expr *e,
} else if (const auto *uo = dyn_cast<UnaryOperator>(e)) {
switch (uo->getOpcode()) {
case UO_Imag:
- cgf.cgm.errorNYI(e->getSourceRange(),
- "ScalarExprEmitter::emitPromoted unary imag");
- return {};
case UO_Real:
return VisitRealImag(uo, promotionType);
case UO_Minus:
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 166435f..ef07db3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1197,6 +1197,8 @@ public:
bool delegating, Address thisAddr,
CallArgList &args, clang::SourceLocation loc);
+ void emitCXXDeleteExpr(const CXXDeleteExpr *e);
+
void emitCXXDestructorCall(const CXXDestructorDecl *dd, CXXDtorType type,
bool forVirtualBase, bool delegating,
Address thisAddr, QualType thisTy);
@@ -1244,6 +1246,9 @@ public:
void emitDelegatingCXXConstructorCall(const CXXConstructorDecl *ctor,
const FunctionArgList &args);
+ void emitDeleteCall(const FunctionDecl *deleteFD, mlir::Value ptr,
+ QualType deleteTy);
+
mlir::LogicalResult emitDoStmt(const clang::DoStmt &s);
/// Emit an expression as an initializer for an object (variable, field, etc.)
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index eef23a0..c977ff9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -119,6 +119,19 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
cir::OptInfoAttr::get(&mlirContext,
cgo.OptimizationLevel,
cgo.OptimizeSize));
+ // Set the module name to be the name of the main file. TranslationUnitDecl
+ // often contains invalid source locations and isn't a reliable source for the
+ // module location.
+ FileID mainFileId = astContext.getSourceManager().getMainFileID();
+ const FileEntry &mainFile =
+ *astContext.getSourceManager().getFileEntryForID(mainFileId);
+ StringRef path = mainFile.tryGetRealPathName();
+ if (!path.empty()) {
+ theModule.setSymName(path);
+ theModule->setLoc(mlir::FileLineColLoc::get(&mlirContext, path,
+ /*line=*/0,
+ /*column=*/0));
+ }
}
CIRGenModule::~CIRGenModule() = default;
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index e842892..644c383 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -216,6 +216,7 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s,
case Stmt::OMPSimdDirectiveClass:
case Stmt::OMPTileDirectiveClass:
case Stmt::OMPUnrollDirectiveClass:
+ case Stmt::OMPFuseDirectiveClass:
case Stmt::OMPForDirectiveClass:
case Stmt::OMPForSimdDirectiveClass:
case Stmt::OMPSectionsDirectiveClass:
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 58ef500..fb87036 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1355,9 +1355,11 @@ mlir::LogicalResult cir::GlobalOp::verify() {
return success();
}
-void cir::GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState,
- llvm::StringRef sym_name, mlir::Type sym_type,
- bool isConstant, cir::GlobalLinkageKind linkage) {
+void cir::GlobalOp::build(
+ OpBuilder &odsBuilder, OperationState &odsState, llvm::StringRef sym_name,
+ mlir::Type sym_type, bool isConstant, cir::GlobalLinkageKind linkage,
+ function_ref<void(OpBuilder &, Location)> ctorBuilder,
+ function_ref<void(OpBuilder &, Location)> dtorBuilder) {
odsState.addAttribute(getSymNameAttrName(odsState.name),
odsBuilder.getStringAttr(sym_name));
odsState.addAttribute(getSymTypeAttrName(odsState.name),
@@ -1370,26 +1372,88 @@ void cir::GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState,
cir::GlobalLinkageKindAttr::get(odsBuilder.getContext(), linkage);
odsState.addAttribute(getLinkageAttrName(odsState.name), linkageAttr);
+ Region *ctorRegion = odsState.addRegion();
+ if (ctorBuilder) {
+ odsBuilder.createBlock(ctorRegion);
+ ctorBuilder(odsBuilder, odsState.location);
+ }
+
+ Region *dtorRegion = odsState.addRegion();
+ if (dtorBuilder) {
+ odsBuilder.createBlock(dtorRegion);
+ dtorBuilder(odsBuilder, odsState.location);
+ }
+
odsState.addAttribute(getGlobalVisibilityAttrName(odsState.name),
cir::VisibilityAttr::get(odsBuilder.getContext()));
}
+/// Given a region branch point, return the regions that may be selected next
+/// during the flow of control. When entering from the parent operation, the
+/// non-empty `ctor` and `dtor` regions may be selected; both branch back to
+/// the parent.
+void cir::GlobalOp::getSuccessorRegions(
+ mlir::RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
+ // The `ctor` and `dtor` regions always branch back to the parent operation.
+ if (!point.isParent()) {
+ regions.push_back(RegionSuccessor());
+ return;
+ }
+
+ // Don't consider the ctor region if it is empty.
+ Region *ctorRegion = &this->getCtorRegion();
+ if (ctorRegion->empty())
+ ctorRegion = nullptr;
+
+ // Don't consider the dtor region if it is empty.
+ Region *dtorRegion = &this->getDtorRegion();
+ if (dtorRegion->empty())
+ dtorRegion = nullptr;
+
+ // Any non-empty region may be selected as a successor.
+ if (ctorRegion)
+ regions.push_back(RegionSuccessor(ctorRegion));
+ if (dtorRegion)
+ regions.push_back(RegionSuccessor(dtorRegion));
+}
+
static void printGlobalOpTypeAndInitialValue(OpAsmPrinter &p, cir::GlobalOp op,
- TypeAttr type,
- Attribute initAttr) {
+ TypeAttr type, Attribute initAttr,
+ mlir::Region &ctorRegion,
+ mlir::Region &dtorRegion) {
+ auto printType = [&]() { p << ": " << type; };
if (!op.isDeclaration()) {
p << "= ";
- // This also prints the type...
- if (initAttr)
- printConstant(p, initAttr);
+ if (!ctorRegion.empty()) {
+ p << "ctor ";
+ printType();
+ p << " ";
+ p.printRegion(ctorRegion,
+ /*printEntryBlockArgs=*/false,
+ /*printBlockTerminators=*/false);
+ } else {
+ // This also prints the type...
+ if (initAttr)
+ printConstant(p, initAttr);
+ }
+
+ if (!dtorRegion.empty()) {
+ p << " dtor ";
+ p.printRegion(dtorRegion,
+ /*printEntryBlockArgs=*/false,
+ /*printBlockTerminators=*/false);
+ }
} else {
- p << ": " << type;
+ printType();
}
}
-static ParseResult
-parseGlobalOpTypeAndInitialValue(OpAsmParser &parser, TypeAttr &typeAttr,
- Attribute &initialValueAttr) {
+static ParseResult parseGlobalOpTypeAndInitialValue(OpAsmParser &parser,
+ TypeAttr &typeAttr,
+ Attribute &initialValueAttr,
+ mlir::Region &ctorRegion,
+ mlir::Region &dtorRegion) {
mlir::Type opTy;
if (parser.parseOptionalEqual().failed()) {
// Absence of equal means a declaration, so we need to parse the type.
@@ -1397,16 +1461,38 @@ parseGlobalOpTypeAndInitialValue(OpAsmParser &parser, TypeAttr &typeAttr,
if (parser.parseColonType(opTy))
return failure();
} else {
- // Parse constant with initializer, examples:
- // cir.global @y = #cir.fp<1.250000e+00> : !cir.double
- // cir.global @rgb = #cir.const_array<[...] : !cir.array<i8 x 3>>
- if (parseConstantValue(parser, initialValueAttr).failed())
- return failure();
+ // Parse constructor, example:
+ // cir.global @rgb = ctor : type { ... }
+ if (!parser.parseOptionalKeyword("ctor")) {
+ if (parser.parseColonType(opTy))
+ return failure();
+ auto parseLoc = parser.getCurrentLocation();
+ if (parser.parseRegion(ctorRegion, /*arguments=*/{}, /*argTypes=*/{}))
+ return failure();
+ if (ensureRegionTerm(parser, ctorRegion, parseLoc).failed())
+ return failure();
+ } else {
+ // Parse constant with initializer, examples:
+ // cir.global @y = 3.400000e+00 : f32
+ // cir.global @rgb = #cir.const_array<[...] : !cir.array<i8 x 3>>
+ if (parseConstantValue(parser, initialValueAttr).failed())
+ return failure();
+
+ assert(mlir::isa<mlir::TypedAttr>(initialValueAttr) &&
+ "Non-typed attrs shouldn't appear here.");
+ auto typedAttr = mlir::cast<mlir::TypedAttr>(initialValueAttr);
+ opTy = typedAttr.getType();
+ }
- assert(mlir::isa<mlir::TypedAttr>(initialValueAttr) &&
- "Non-typed attrs shouldn't appear here.");
- auto typedAttr = mlir::cast<mlir::TypedAttr>(initialValueAttr);
- opTy = typedAttr.getType();
+ // Parse destructor, example:
+ // dtor { ... }
+ if (!parser.parseOptionalKeyword("dtor")) {
+ auto parseLoc = parser.getCurrentLocation();
+ if (parser.parseRegion(dtorRegion, /*arguments=*/{}, /*argTypes=*/{}))
+ return failure();
+ if (ensureRegionTerm(parser, dtorRegion, parseLoc).failed())
+ return failure();
+ }
}
typeAttr = TypeAttr::get(opTy);
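
For context, the ctor/dtor regions printed and parsed above correspond to globals whose initialization and cleanup cannot be expressed as a constant initializer. A minimal sketch of such an input (the Guard type is hypothetical):

    struct Guard {
      Guard();   // dynamic initialization -> emitted into the "ctor" region
      ~Guard();  // cleanup at program exit -> emitted into the "dtor" region
    };
    Guard g; // expected to print roughly as:
             //   cir.global @g = ctor : !ty { ... } dtor { ... }
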
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 57db20f7..64f1917 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1090,8 +1090,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
if (std::optional<GCOVOptions> Options =
getGCOVOptions(CodeGenOpts, LangOpts))
PB.registerPipelineStartEPCallback(
- [Options](ModulePassManager &MPM, OptimizationLevel Level) {
- MPM.addPass(GCOVProfilerPass(*Options));
+ [this, Options](ModulePassManager &MPM, OptimizationLevel Level) {
+ MPM.addPass(
+ GCOVProfilerPass(*Options, CI.getVirtualFileSystemPtr()));
});
if (std::optional<InstrProfOptions> Options =
getInstrProfOptions(CodeGenOpts, LangOpts))
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 12c7d48..fee6bc0 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -26,6 +26,7 @@
#include "clang/AST/DeclObjC.h"
#include "clang/AST/DeclTemplate.h"
#include "clang/AST/Expr.h"
+#include "clang/AST/LambdaCapture.h"
#include "clang/AST/RecordLayout.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/AST/VTableBuilder.h"
@@ -1903,46 +1904,61 @@ CGDebugInfo::createInlinedSubprogram(StringRef FuncName,
return SP;
}
+llvm::StringRef
+CGDebugInfo::GetLambdaCaptureName(const LambdaCapture &Capture) {
+ if (Capture.capturesThis())
+ return CGM.getCodeGenOpts().EmitCodeView ? "__this" : "this";
+
+ assert(Capture.capturesVariable());
+
+ const ValueDecl *CaptureDecl = Capture.getCapturedVar();
+ assert(CaptureDecl && "Expected valid decl for captured variable.");
+
+ return CaptureDecl->getName();
+}
+
void CGDebugInfo::CollectRecordLambdaFields(
const CXXRecordDecl *CXXDecl, SmallVectorImpl<llvm::Metadata *> &elements,
llvm::DIType *RecordTy) {
// For C++11 Lambdas a Field will be the same as a Capture, but the Capture
// has the name and the location of the variable so we should iterate over
// both concurrently.
- const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(CXXDecl);
RecordDecl::field_iterator Field = CXXDecl->field_begin();
unsigned fieldno = 0;
for (CXXRecordDecl::capture_const_iterator I = CXXDecl->captures_begin(),
E = CXXDecl->captures_end();
I != E; ++I, ++Field, ++fieldno) {
- const LambdaCapture &C = *I;
- if (C.capturesVariable()) {
- SourceLocation Loc = C.getLocation();
- assert(!Field->isBitField() && "lambdas don't have bitfield members!");
- ValueDecl *V = C.getCapturedVar();
- StringRef VName = V->getName();
- llvm::DIFile *VUnit = getOrCreateFile(Loc);
- auto Align = getDeclAlignIfRequired(V, CGM.getContext());
- llvm::DIType *FieldType = createFieldType(
- VName, Field->getType(), Loc, Field->getAccess(),
- layout.getFieldOffset(fieldno), Align, VUnit, RecordTy, CXXDecl);
- elements.push_back(FieldType);
- } else if (C.capturesThis()) {
+ const LambdaCapture &Capture = *I;
+ const uint64_t FieldOffset =
+ CGM.getContext().getASTRecordLayout(CXXDecl).getFieldOffset(fieldno);
+
+ assert(!Field->isBitField() && "lambdas don't have bitfield members!");
+
+ SourceLocation Loc;
+ uint32_t Align = 0;
+
+ if (Capture.capturesThis()) {
// TODO: Need to handle 'this' in some way by probably renaming the
// this of the lambda class and having a field member of 'this' or
// by using AT_object_pointer for the function and having that be
// used as 'this' for semantic references.
- FieldDecl *f = *Field;
- llvm::DIFile *VUnit = getOrCreateFile(f->getLocation());
- QualType type = f->getType();
- StringRef ThisName =
- CGM.getCodeGenOpts().EmitCodeView ? "__this" : "this";
- llvm::DIType *fieldType = createFieldType(
- ThisName, type, f->getLocation(), f->getAccess(),
- layout.getFieldOffset(fieldno), VUnit, RecordTy, CXXDecl);
-
- elements.push_back(fieldType);
+ Loc = Field->getLocation();
+ } else if (Capture.capturesVariable()) {
+ Loc = Capture.getLocation();
+
+ const ValueDecl *CaptureDecl = Capture.getCapturedVar();
+ assert(CaptureDecl && "Expected valid decl for captured variable.");
+
+ Align = getDeclAlignIfRequired(CaptureDecl, CGM.getContext());
+ } else {
+ continue;
}
+
+ llvm::DIFile *VUnit = getOrCreateFile(Loc);
+
+ elements.push_back(createFieldType(
+ GetLambdaCaptureName(Capture), Field->getType(), Loc,
+ Field->getAccess(), FieldOffset, Align, VUnit, RecordTy, CXXDecl));
}
}
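
A sketch of the capture kinds the rewritten loop handles, with the member names now centralized in GetLambdaCaptureName:

    struct S {
      int m;
      void f() {
        int local = 0;
        auto l = [this, local] { return m + local; };
        // The debug type for decltype(l) gets one member per capture:
        // "this" ("__this" under CodeView) and "local".
      }
    };
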
diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
index f860773..78c3eb9 100644
--- a/clang/lib/CodeGen/CGDebugInfo.h
+++ b/clang/lib/CodeGen/CGDebugInfo.h
@@ -397,6 +397,7 @@ private:
void CollectRecordFields(const RecordDecl *Decl, llvm::DIFile *F,
SmallVectorImpl<llvm::Metadata *> &E,
llvm::DICompositeType *RecordTy);
+ llvm::StringRef GetLambdaCaptureName(const LambdaCapture &Capture);
/// If the C++ class has vtable info then insert appropriate debug
/// info entry in EltTys vector.
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 4401006..92636f2 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -234,6 +234,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
case Stmt::OMPInterchangeDirectiveClass:
EmitOMPInterchangeDirective(cast<OMPInterchangeDirective>(*S));
break;
+ case Stmt::OMPFuseDirectiveClass:
+ EmitOMPFuseDirective(cast<OMPFuseDirective>(*S));
+ break;
case Stmt::OMPForDirectiveClass:
EmitOMPForDirective(cast<OMPForDirective>(*S));
break;
@@ -2471,56 +2474,6 @@ void CodeGenFunction::EmitSwitchStmt(const SwitchStmt &S) {
CaseRangeBlock = SavedCRBlock;
}
-static std::string
-SimplifyConstraint(const char *Constraint, const TargetInfo &Target,
- SmallVectorImpl<TargetInfo::ConstraintInfo> *OutCons=nullptr) {
- std::string Result;
-
- while (*Constraint) {
- switch (*Constraint) {
- default:
- Result += Target.convertConstraint(Constraint);
- break;
- // Ignore these
- case '*':
- case '?':
- case '!':
- case '=': // Will see this and the following in mult-alt constraints.
- case '+':
- break;
- case '#': // Ignore the rest of the constraint alternative.
- while (Constraint[1] && Constraint[1] != ',')
- Constraint++;
- break;
- case '&':
- case '%':
- Result += *Constraint;
- while (Constraint[1] && Constraint[1] == *Constraint)
- Constraint++;
- break;
- case ',':
- Result += "|";
- break;
- case 'g':
- Result += "imr";
- break;
- case '[': {
- assert(OutCons &&
- "Must pass output names to constraints with a symbolic name");
- unsigned Index;
- bool result = Target.resolveSymbolicName(Constraint, *OutCons, Index);
- assert(result && "Could not resolve symbolic name"); (void)result;
- Result += llvm::utostr(Index);
- break;
- }
- }
-
- Constraint++;
- }
-
- return Result;
-}
-
/// AddVariableConstraints - Look at AsmExpr and if it is a variable declared
/// as using a particular register add that as a constraint that will be used
/// in this asm stmt.
@@ -2899,8 +2852,8 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
// Simplify the output constraint.
std::string OutputConstraint(S.getOutputConstraint(i));
- OutputConstraint = SimplifyConstraint(OutputConstraint.c_str() + 1,
- getTarget(), &OutputConstraintInfos);
+ OutputConstraint = getTarget().simplifyConstraint(
+ StringRef(OutputConstraint).substr(1), &OutputConstraintInfos);
const Expr *OutExpr = S.getOutputExpr(i);
OutExpr = OutExpr->IgnoreParenNoopCasts(getContext());
@@ -3062,8 +3015,8 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
// Simplify the input constraint.
std::string InputConstraint(S.getInputConstraint(i));
- InputConstraint = SimplifyConstraint(InputConstraint.c_str(), getTarget(),
- &OutputConstraintInfos);
+ InputConstraint =
+ getTarget().simplifyConstraint(InputConstraint, &OutputConstraintInfos);
InputConstraint = AddVariableConstraints(
InputConstraint, *InputExpr->IgnoreParenNoopCasts(getContext()),
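
As an illustration of the rules that moved from the deleted static helper into TargetInfo::simplifyConstraint: ',' becomes '|', 'g' becomes "imr", and modifiers such as '*', '?', '!', '=' and '+' are dropped. A sketch:

    int r;
    // The output constraint "=g" is passed here without its leading '='
    // (see the substr(1) above) and simplifies to "imr" for LLVM.
    asm("movl $1, %0" : "=g"(r));
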
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index ba9c7c6..efc06a2 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -201,6 +201,24 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
} else {
llvm_unreachable("Unknown loop-based directive kind.");
}
+ doEmitPreinits(PreInits);
+ PreCondVars.restore(CGF);
+ }
+
+ void
+ emitPreInitStmt(CodeGenFunction &CGF,
+ const OMPCanonicalLoopSequenceTransformationDirective &S) {
+ const Stmt *PreInits;
+ if (const auto *Fuse = dyn_cast<OMPFuseDirective>(&S)) {
+ PreInits = Fuse->getPreInits();
+ } else {
+ llvm_unreachable(
+ "Unknown canonical loop sequence transform directive kind.");
+ }
+ doEmitPreinits(PreInits);
+ }
+
+ void doEmitPreinits(const Stmt *PreInits) {
if (PreInits) {
// CompoundStmts and DeclStmts are used as lists of PreInit statements and
// declarations. Since declarations must be visible in the following
@@ -222,7 +240,6 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
CGF.EmitStmt(S);
}
}
- PreCondVars.restore(CGF);
}
public:
@@ -230,6 +247,11 @@ public:
: CodeGenFunction::RunCleanupsScope(CGF) {
emitPreInitStmt(CGF, S);
}
+ OMPLoopScope(CodeGenFunction &CGF,
+ const OMPCanonicalLoopSequenceTransformationDirective &S)
+ : CodeGenFunction::RunCleanupsScope(CGF) {
+ emitPreInitStmt(CGF, S);
+ }
};
class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope {
@@ -1929,6 +1951,15 @@ public:
CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP);
CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI);
}
+ if (const auto *Dir =
+ dyn_cast<OMPCanonicalLoopSequenceTransformationDirective>(S)) {
+ // For simplicity we reuse the loop scope, as we do for
+ // OMPCanonicalLoopNestTransformationDirective, which gets it by being a
+ // subclass of OMPLoopBasedDirective.
+ Scope = new OMPLoopScope(CGF, *Dir);
+ CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP);
+ CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI);
+ }
}
~OMPTransformDirectiveScopeRAII() {
if (!Scope)
@@ -1956,8 +1987,7 @@ static void emitBody(CodeGenFunction &CGF, const Stmt *S, const Stmt *NextLoop,
return;
}
if (SimplifiedS == NextLoop) {
- if (auto *Dir =
- dyn_cast<OMPCanonicalLoopNestTransformationDirective>(SimplifiedS))
+ if (auto *Dir = dyn_cast<OMPLoopTransformationDirective>(SimplifiedS))
SimplifiedS = Dir->getTransformedStmt();
if (const auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(SimplifiedS))
SimplifiedS = CanonLoop->getLoopStmt();
@@ -2952,6 +2982,12 @@ void CodeGenFunction::EmitOMPInterchangeDirective(
EmitStmt(S.getTransformedStmt());
}
+void CodeGenFunction::EmitOMPFuseDirective(const OMPFuseDirective &S) {
+ // Emit the de-sugared statement.
+ OMPTransformDirectiveScopeRAII FuseScope(*this, &S);
+ EmitStmt(S.getTransformedStmt());
+}
+
void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) {
bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder;
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 727487b..f0565c1 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3861,6 +3861,7 @@ public:
void EmitOMPUnrollDirective(const OMPUnrollDirective &S);
void EmitOMPReverseDirective(const OMPReverseDirective &S);
void EmitOMPInterchangeDirective(const OMPInterchangeDirective &S);
+ void EmitOMPFuseDirective(const OMPFuseDirective &S);
void EmitOMPForDirective(const OMPForDirective &S);
void EmitOMPForSimdDirective(const OMPForSimdDirective &S);
void EmitOMPScopeDirective(const OMPScopeDirective &S);
diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 98b30e08..8f09564 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -972,7 +972,7 @@ void PGOHash::combine(HashType Type) {
if (Count && Count % NumTypesPerWord == 0) {
using namespace llvm::support;
uint64_t Swapped =
- endian::byte_swap<uint64_t, llvm::endianness::little>(Working);
+ endian::byte_swap<uint64_t>(Working, llvm::endianness::little);
MD5.update(llvm::ArrayRef((uint8_t *)&Swapped, sizeof(Swapped)));
Working = 0;
}
@@ -999,7 +999,7 @@ uint64_t PGOHash::finalize() {
} else {
using namespace llvm::support;
uint64_t Swapped =
- endian::byte_swap<uint64_t, llvm::endianness::little>(Working);
+ endian::byte_swap<uint64_t>(Working, llvm::endianness::little);
MD5.update(llvm::ArrayRef((uint8_t *)&Swapped, sizeof(Swapped)));
}
}
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 07cf08c..6596ec0 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -192,9 +192,17 @@ static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
return CGF.Builder.CreateCall(F, {Src0, Src1});
}
+static inline StringRef mapScopeToSPIRV(StringRef AMDGCNScope) {
+ if (AMDGCNScope == "agent")
+ return "device";
+ if (AMDGCNScope == "wavefront")
+ return "subgroup";
+ return AMDGCNScope;
+}
+
// For processing memory ordering and memory scope arguments of various
// amdgcn builtins.
-// \p Order takes a C++11 comptabile memory-ordering specifier and converts
+// \p Order takes a C++11 compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using atomic C ABI, and writes
// to \p AO. \p Scope takes a const char * and converts it into AMDGCN
// specific SyncScopeID and writes it to \p SSID.
@@ -227,6 +235,8 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
// Some of the atomic builtins take the scope as a string name.
StringRef scp;
if (llvm::getConstantStringInfo(Scope, scp)) {
+ if (getTarget().getTriple().isSPIRV())
+ scp = mapScopeToSPIRV(scp);
SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
return;
}
@@ -238,13 +248,19 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
SSID = llvm::SyncScope::System;
break;
case 1: // __MEMORY_SCOPE_DEVICE
- SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+ if (getTarget().getTriple().isSPIRV())
+ SSID = getLLVMContext().getOrInsertSyncScopeID("device");
+ else
+ SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
break;
case 2: // __MEMORY_SCOPE_WRKGRP
SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
break;
case 3: // __MEMORY_SCOPE_WVFRNT
- SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+ if (getTarget().getTriple().isSPIRV())
+ SSID = getLLVMContext().getOrInsertSyncScopeID("subgroup");
+ else
+ SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
break;
case 4: // __MEMORY_SCOPE_SINGLE
SSID = llvm::SyncScope::SingleThread;
@@ -1510,7 +1526,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
//
// The global/flat cases need to use agent scope to consistently produce
// the native instruction instead of a cmpxchg expansion.
- SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+ if (getTarget().getTriple().isSPIRV())
+ SSID = getLLVMContext().getOrInsertSyncScopeID("device");
+ else
+ SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
AO = AtomicOrdering::Monotonic;
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
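
A minimal sketch of the remapping, using the existing __builtin_amdgcn_fence builtin that routes through ProcessOrderScopeAMDGCN:

    void fence_agent_scope() {
      // On amdgcn triples this uses the "agent" sync scope; on SPIR-V
      // triples, mapScopeToSPIRV renames it to "device".
      __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
    }
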
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f110dba..85a13357 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6613,6 +6613,9 @@ std::string Driver::GetStdModuleManifestPath(const Compilation &C,
const ToolChain &TC) const {
std::string error = "<NOT PRESENT>";
+ if (C.getArgs().hasArg(options::OPT_nostdlib))
+ return error;
+
switch (TC.GetCXXStdlibType(C.getArgs())) {
case ToolChain::CST_Libcxx: {
auto evaluate = [&](const char *library) -> std::optional<std::string> {
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 6a8286d..67066a1 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -833,11 +833,6 @@ private:
if (Parent && Parent->is(TT_PointerOrReference))
Parent->overwriteFixedType(TT_BinaryOperator);
}
- // An arrow after an ObjC method expression is not a lambda arrow.
- if (CurrentToken->is(TT_ObjCMethodExpr) && CurrentToken->Next &&
- CurrentToken->Next->is(TT_LambdaArrow)) {
- CurrentToken->Next->overwriteFixedType(TT_Unknown);
- }
Left->MatchingParen = CurrentToken;
CurrentToken->MatchingParen = Left;
// FirstObjCSelectorName is set when a colon is found. This does
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 2c9766c9..6948b3d 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -2268,7 +2268,7 @@ bool UnwrappedLineParser::tryToParseLambda() {
if (!tryToParseLambdaIntroducer())
return false;
- bool SeenArrow = false;
+ FormatToken *Arrow = nullptr;
bool InTemplateParameterList = false;
while (FormatTok->isNot(tok::l_brace)) {
@@ -2343,17 +2343,13 @@ bool UnwrappedLineParser::tryToParseLambda() {
case tok::ellipsis:
case tok::kw_true:
case tok::kw_false:
- if (SeenArrow || InTemplateParameterList) {
+ if (Arrow || InTemplateParameterList) {
nextToken();
break;
}
return true;
case tok::arrow:
- // This might or might not actually be a lambda arrow (this could be an
- // ObjC method invocation followed by a dereferencing arrow). We might
- // reset this back to TT_Unknown in TokenAnnotator.
- FormatTok->setFinalizedType(TT_LambdaArrow);
- SeenArrow = true;
+ Arrow = FormatTok;
nextToken();
break;
case tok::kw_requires: {
@@ -2375,6 +2371,9 @@ bool UnwrappedLineParser::tryToParseLambda() {
FormatTok->setFinalizedType(TT_LambdaLBrace);
LSquare.setFinalizedType(TT_LambdaLSquare);
+ if (Arrow)
+ Arrow->setFinalizedType(TT_LambdaArrow);
+
NestedLambdas.push_back(Line->SeenDecltypeAuto);
parseChildBlock();
assert(!NestedLambdas.empty());
@@ -2388,11 +2387,6 @@ bool UnwrappedLineParser::tryToParseLambdaIntroducer() {
const FormatToken *LeftSquare = FormatTok;
nextToken();
if (Previous) {
- if (Previous->Tok.getIdentifierInfo() &&
- !Previous->isOneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield,
- tok::kw_co_return)) {
- return false;
- }
if (Previous->closesScope()) {
// Not a potential C-style cast.
if (Previous->isNot(tok::r_paren))
@@ -2402,6 +2396,13 @@ bool UnwrappedLineParser::tryToParseLambdaIntroducer() {
// and `int (*)()`.
if (!BeforeRParen || !BeforeRParen->isOneOf(tok::greater, tok::r_paren))
return false;
+ } else if (Previous->is(tok::star)) {
+ Previous = Previous->getPreviousNonComment();
+ }
+ if (Previous && Previous->Tok.getIdentifierInfo() &&
+ !Previous->isOneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield,
+ tok::kw_co_return)) {
+ return false;
}
}
if (LeftSquare->isCppStructuredBinding(IsCpp))
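
The practical effect of keeping the arrow token around instead of a bool: the token is only finalized as TT_LambdaArrow once the l_brace confirms the lambda, so a trailing return type such as the one below is annotated at that point rather than speculatively:

    auto f = [](int x) -> int { return x; };
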
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index edf0a09..877ab02 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -742,7 +742,10 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
Builder.defineMacro("__cpp_impl_coroutine", "201902L");
Builder.defineMacro("__cpp_designated_initializers", "201707L");
Builder.defineMacro("__cpp_impl_three_way_comparison", "201907L");
- //Builder.defineMacro("__cpp_modules", "201907L");
+ // Intentionally set __cpp_modules to 1.
+ // See https://github.com/llvm/llvm-project/issues/71364 for details.
+ // Builder.defineMacro("__cpp_modules", "201907L");
+ Builder.defineMacro("__cpp_modules", "1");
Builder.defineMacro("__cpp_using_enum", "201907L");
}
// C++23 features.
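
Code probing the feature-test macro now sees it defined, but with a deliberately low value; a sketch of the intended pattern:

    #if defined(__cpp_modules) && __cpp_modules >= 201907L
    // Full C++20 modules conformance claimed.
    #elif defined(__cpp_modules)
    // Partial support, as Clang now advertises with the value 1.
    #endif
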
diff --git a/clang/lib/Frontend/ModuleDependencyCollector.cpp b/clang/lib/Frontend/ModuleDependencyCollector.cpp
index 3b363f9..ff37065 100644
--- a/clang/lib/Frontend/ModuleDependencyCollector.cpp
+++ b/clang/lib/Frontend/ModuleDependencyCollector.cpp
@@ -91,10 +91,10 @@ void ModuleDependencyCollector::attachToPreprocessor(Preprocessor &PP) {
std::make_unique<ModuleDependencyMMCallbacks>(*this));
}
-static bool isCaseSensitivePath(StringRef Path) {
+static bool isCaseSensitivePath(llvm::vfs::FileSystem &VFS, StringRef Path) {
SmallString<256> TmpDest = Path, UpperDest, RealDest;
// Remove component traversals, links, etc.
- if (llvm::sys::fs::real_path(Path, TmpDest))
+ if (VFS.getRealPath(Path, TmpDest))
return true; // Current default value in vfs.yaml
Path = TmpDest;
@@ -104,7 +104,7 @@ static bool isCaseSensitivePath(StringRef Path) {
// already expects when sensitivity isn't setup.
for (auto &C : Path)
UpperDest.push_back(toUppercase(C));
- if (!llvm::sys::fs::real_path(UpperDest, RealDest) && Path == RealDest)
+ if (!VFS.getRealPath(UpperDest, RealDest) && Path == RealDest)
return false;
return true;
}
@@ -121,7 +121,8 @@ void ModuleDependencyCollector::writeFileMap() {
// Explicitly set case sensitivity for the YAML writer. For that, find out
// the sensitivity at the path where the headers all collected to.
- VFSWriter.setCaseSensitivity(isCaseSensitivePath(VFSDir));
+ VFSWriter.setCaseSensitivity(
+ isCaseSensitivePath(Canonicalizer.getFileSystem(), VFSDir));
// Do not rely on real path names when executing the crash reproducer scripts
// since we only want to actually use the files we have on the VFS cache.
@@ -153,7 +154,7 @@ std::error_code ModuleDependencyCollector::copyToRoot(StringRef Src,
} else {
// When collecting entries from input vfsoverlays, copy the external
// contents into the cache but still map from the source.
- if (!fs::exists(Dst))
+ if (!Canonicalizer.getFileSystem().exists(Dst))
return std::error_code();
path::append(CacheDst, Dst);
Paths.CopyFrom = Dst;
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index a7f7099..d6ba19a 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -2311,10 +2311,9 @@ _mm256_cvttps_epi32(__m256 __a)
/// \param __a
/// A 256-bit vector of [4 x double].
/// \returns A 64 bit double containing the first element of the input vector.
-static __inline double __DEFAULT_FN_ATTRS
-_mm256_cvtsd_f64(__m256d __a)
-{
- return __a[0];
+static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_cvtsd_f64(__m256d __a) {
+ return __a[0];
}
/// Returns the first element of the input vector of [8 x i32].
@@ -2327,11 +2326,10 @@ _mm256_cvtsd_f64(__m256d __a)
/// \param __a
/// A 256-bit vector of [8 x i32].
/// \returns A 32 bit integer containing the first element of the input vector.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_cvtsi256_si32(__m256i __a)
-{
- __v8si __b = (__v8si)__a;
- return __b[0];
+static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_cvtsi256_si32(__m256i __a) {
+ __v8si __b = (__v8si)__a;
+ return __b[0];
}
/// Returns the first element of the input vector of [8 x float].
@@ -2344,10 +2342,9 @@ _mm256_cvtsi256_si32(__m256i __a)
/// \param __a
/// A 256-bit vector of [8 x float].
/// \returns A 32 bit float containing the first element of the input vector.
-static __inline float __DEFAULT_FN_ATTRS
-_mm256_cvtss_f32(__m256 __a)
-{
- return __a[0];
+static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_cvtss_f32(__m256 __a) {
+ return __a[0];
}
/* Vector replicate */
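
With the _CONSTEXPR attribute variants, these extractors become usable in C++ constant expressions; a sketch, assuming a translation unit compiled with AVX enabled:

    constexpr __m256d v = {1.0, 2.0, 3.0, 4.0};
    static_assert(_mm256_cvtsd_f64(v) == 1.0, "extracts the first element");
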
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 8605ba2..a2c6957 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -1299,7 +1299,7 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
Diag(Tok, getLangOpts().CPlusPlus23
? diag::warn_cxx20_compat_decl_attrs_on_lambda
: diag::ext_decl_attrs_on_lambda)
- << Tok.getIdentifierInfo() << Tok.isRegularKeywordAttribute();
+ << Tok.isRegularKeywordAttribute() << Tok.getIdentifierInfo();
MaybeParseCXX11Attributes(D);
}
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 02f3f10..04f29c8 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -2968,6 +2968,39 @@ OMPClause *Parser::ParseOpenMPSizesClause() {
OpenLoc, CloseLoc);
}
+OMPClause *Parser::ParseOpenMPLoopRangeClause() {
+ SourceLocation ClauseNameLoc = ConsumeToken();
+ SourceLocation FirstLoc, CountLoc;
+
+ BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end);
+ if (T.consumeOpen()) {
+ Diag(Tok, diag::err_expected) << tok::l_paren;
+ return nullptr;
+ }
+
+ FirstLoc = Tok.getLocation();
+ ExprResult FirstVal = ParseConstantExpression();
+ if (!FirstVal.isUsable()) {
+ T.skipToEnd();
+ return nullptr;
+ }
+
+ ExpectAndConsume(tok::comma);
+
+ CountLoc = Tok.getLocation();
+ ExprResult CountVal = ParseConstantExpression();
+ if (!CountVal.isUsable()) {
+ T.skipToEnd();
+ return nullptr;
+ }
+
+ T.consumeClose();
+
+ return Actions.OpenMP().ActOnOpenMPLoopRangeClause(
+ FirstVal.get(), CountVal.get(), ClauseNameLoc, T.getOpenLocation(),
+ FirstLoc, CountLoc, T.getCloseLocation());
+}
+
OMPClause *Parser::ParseOpenMPPermutationClause() {
SourceLocation ClauseNameLoc, OpenLoc, CloseLoc;
SmallVector<Expr *> ArgExprs;
@@ -3473,6 +3506,9 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
}
Clause = ParseOpenMPClause(CKind, WrongDirective);
break;
+ case OMPC_looprange:
+ Clause = ParseOpenMPLoopRangeClause();
+ break;
default:
break;
}
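
The clause parsed above takes two constant expressions, first and count. A sketch of the surface syntax this enables (loop bodies are placeholders):

    void f(int n, double *a, double *b) {
    #pragma omp fuse looprange(1, 2)
      {
        for (int i = 0; i < n; ++i) a[i] += 1.0;  // loop 1: fused
        for (int i = 0; i < n; ++i) b[i] += 2.0;  // loop 2: fused
        for (int i = 0; i < n; ++i) a[i] *= b[i]; // loop 3: kept as-is
      }
    }
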
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 1b66d83..8606227 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -983,10 +983,9 @@ static void DiagUninitUse(Sema &S, const VarDecl *VD, const UninitUse &Use,
case UninitUse::AfterDecl:
case UninitUse::AfterCall:
S.Diag(VD->getLocation(), diag::warn_sometimes_uninit_var)
- << VD->getDeclName() << IsCapturedByBlock
- << (Use.getKind() == UninitUse::AfterDecl ? 4 : 5)
- << const_cast<DeclContext*>(VD->getLexicalDeclContext())
- << VD->getSourceRange();
+ << VD->getDeclName() << IsCapturedByBlock
+ << (Use.getKind() == UninitUse::AfterDecl ? 4 : 5)
+ << VD->getLexicalDeclContext() << VD->getSourceRange();
S.Diag(Use.getUser()->getBeginLoc(), diag::note_uninit_var_use)
<< IsCapturedByBlock << Use.getUser()->getSourceRange();
return;
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index d238b79..dc6d232 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -193,7 +193,7 @@ DiagRecursiveConstraintEval(Sema &S, llvm::FoldingSetNodeID &ID,
// Sema::InstantiatingTemplate::isAlreadyBeingInstantiated function.
if (S.SatisfactionStackContains(Templ, ID)) {
S.Diag(E->getExprLoc(), diag::err_constraint_depends_on_self)
- << const_cast<Expr *>(E) << E->getSourceRange();
+ << E << E->getSourceRange();
return true;
}
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 552c929..a0483c3 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1493,6 +1493,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
case Stmt::OMPUnrollDirectiveClass:
case Stmt::OMPReverseDirectiveClass:
case Stmt::OMPInterchangeDirectiveClass:
+ case Stmt::OMPFuseDirectiveClass:
case Stmt::OMPSingleDirectiveClass:
case Stmt::OMPTargetDataDirectiveClass:
case Stmt::OMPTargetDirectiveClass:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 3b267c1..3302bfc 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -20108,8 +20108,9 @@ static void DoMarkVarDeclReferenced(
bool NeededForConstantEvaluation =
isPotentiallyConstantEvaluatedContext(SemaRef) && UsableInConstantExpr;
- bool NeedDefinition =
- OdrUse == OdrUseContext::Used || NeededForConstantEvaluation;
+ bool NeedDefinition = OdrUse == OdrUseContext::Used ||
+ NeededForConstantEvaluation ||
+ Var->getType()->isUndeducedType();
assert(!isa<VarTemplatePartialSpecializationDecl>(Var) &&
"Can't instantiate a partial template specialization.");
diff --git a/clang/lib/Sema/SemaOpenACCAtomic.cpp b/clang/lib/Sema/SemaOpenACCAtomic.cpp
index a9319dc..ad21129 100644
--- a/clang/lib/Sema/SemaOpenACCAtomic.cpp
+++ b/clang/lib/Sema/SemaOpenACCAtomic.cpp
@@ -454,9 +454,7 @@ class AtomicOperandChecker {
// If nothing matches, error out.
DiagnoseInvalidAtomic(BinInf->FoundExpr->getExprLoc(),
SemaRef.PDiag(diag::note_acc_atomic_mismatch_operand)
- << const_cast<Expr *>(AssignInf.LHS)
- << const_cast<Expr *>(BinInf->LHS)
- << const_cast<Expr *>(BinInf->RHS));
+ << AssignInf.LHS << BinInf->LHS << BinInf->RHS);
return IDACInfo::Fail();
}
@@ -592,8 +590,7 @@ class AtomicOperandChecker {
PartialDiagnostic PD =
SemaRef.PDiag(diag::note_acc_atomic_mismatch_compound_operand)
- << FirstKind << const_cast<Expr *>(FirstX) << SecondKind
- << const_cast<Expr *>(SecondX);
+ << FirstKind << FirstX << SecondKind << SecondX;
return DiagnoseInvalidAtomic(SecondX->getExprLoc(), PD);
}
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 48e06d1..0fa21e8 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -2490,7 +2490,8 @@ VarDecl *SemaOpenMP::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo,
DSAStackTy::DSAVarData DVarTop =
DSAStack->getTopDSA(D, DSAStack->isClauseParsingMode());
if (DVarTop.CKind != OMPC_unknown && isOpenMPPrivate(DVarTop.CKind) &&
- (!VD || VD->hasLocalStorage() || !DVarTop.AppliedToPointee))
+ (!VD || VD->hasLocalStorage() ||
+ !(DVarTop.AppliedToPointee && DVarTop.CKind != OMPC_reduction)))
return VD ? VD : cast<VarDecl>(DVarTop.PrivateCopy->getDecl());
// Threadprivate variables must not be captured.
if (isOpenMPThreadPrivate(DVarTop.CKind))
@@ -4569,6 +4570,7 @@ void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind,
case OMPD_unroll:
case OMPD_reverse:
case OMPD_interchange:
+ case OMPD_fuse:
case OMPD_assume:
break;
default:
@@ -6410,6 +6412,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
Res = ActOnOpenMPInterchangeDirective(ClausesWithImplicit, AStmt, StartLoc,
EndLoc);
break;
+ case OMPD_fuse:
+ Res =
+ ActOnOpenMPFuseDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc);
+ break;
case OMPD_for:
Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
VarsWithInheritedDSA);
@@ -9488,7 +9494,9 @@ static bool checkOpenMPIterationSpace(
// sharing attributes.
VarsWithImplicitDSA.erase(LCDecl);
- assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars");
+ assert((isOpenMPLoopDirective(DKind) ||
+ isOpenMPCanonicalLoopSequenceTransformationDirective(DKind)) &&
+ "DSA for non-loop vars");
// Check test-expr.
HasErrors |= ISC.checkAndSetCond(For ? For->getCond() : CXXFor->getCond());
@@ -9916,7 +9924,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
unsigned NumLoops = std::max(OrderedLoopCount, NestedLoopCount);
SmallVector<LoopIterationSpace, 4> IterSpaces(NumLoops);
if (!OMPLoopBasedDirective::doForAllLoops(
- AStmt->IgnoreContainers(!isOpenMPLoopTransformationDirective(DKind)),
+ AStmt->IgnoreContainers(
+ !isOpenMPCanonicalLoopNestTransformationDirective(DKind)),
SupportsNonPerfectlyNested, NumLoops,
[DKind, &SemaRef, &DSA, NumLoops, NestedLoopCount,
CollapseLoopCountExpr, OrderedLoopCountExpr, &VarsWithImplicitDSA,
@@ -9938,8 +9947,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
}
return false;
},
- [&SemaRef,
- &Captures](OMPCanonicalLoopNestTransformationDirective *Transform) {
+ [&SemaRef, &Captures](OMPLoopTransformationDirective *Transform) {
Stmt *DependentPreInits = Transform->getPreInits();
if (!DependentPreInits)
return;
@@ -9954,7 +9962,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
auto *D = cast<VarDecl>(C);
DeclRefExpr *Ref = buildDeclRefExpr(
SemaRef, D, D->getType().getNonReferenceType(),
- Transform->getBeginLoc());
+ cast<OMPExecutableDirective>(Transform->getDirective())
+ ->getBeginLoc());
Captures[Ref] = Ref;
}
}
@@ -14404,10 +14413,34 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective(
getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
}
+/// Updates PreInits by checking Transform against the known loop
+/// transformation directives and appending their pre-inits if a match is
+/// found.
+static void updatePreInits(OMPLoopTransformationDirective *Transform,
+ SmallVectorImpl<Stmt *> &PreInits) {
+ Stmt *Dir = Transform->getDirective();
+ switch (Dir->getStmtClass()) {
+#define STMT(CLASS, PARENT)
+#define ABSTRACT_STMT(CLASS)
+#define COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) \
+ case Stmt::CLASS##Class: \
+ appendFlattenedStmtList(PreInits, \
+ static_cast<const CLASS *>(Dir)->getPreInits()); \
+ break;
+#define OMPCANONICALLOOPNESTTRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT)
+#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \
+ COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT)
+#include "clang/AST/StmtNodes.inc"
+#undef COMMON_OMP_LOOP_TRANSFORMATION
+ default:
+ llvm_unreachable("Not a loop transformation");
+ }
+}
+
bool SemaOpenMP::checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits) {
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *>> &OriginalInits) {
OriginalInits.emplace_back();
bool Result = OMPLoopBasedDirective::doForAllLoops(
AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops,
@@ -14433,29 +14466,268 @@ bool SemaOpenMP::checkTransformableLoopNest(
OriginalInits.emplace_back();
return false;
},
- [&OriginalInits](OMPLoopBasedDirective *Transform) {
- Stmt *DependentPreInits;
- if (auto *Dir = dyn_cast<OMPTileDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPStripeDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPUnrollDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPReverseDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else if (auto *Dir = dyn_cast<OMPInterchangeDirective>(Transform))
- DependentPreInits = Dir->getPreInits();
- else
- llvm_unreachable("Unhandled loop transformation");
-
- appendFlattenedStmtList(OriginalInits.back(), DependentPreInits);
+ [&OriginalInits](OMPLoopTransformationDirective *Transform) {
+ updatePreInits(Transform, OriginalInits.back());
});
assert(OriginalInits.back().empty() && "No preinit after innermost loop");
OriginalInits.pop_back();
return Result;
}
-/// Add preinit statements that need to be propageted from the selected loop.
+/// Counts the total number of OpenMP canonical nested loops, including the
+/// outermost loop (the original loop). PRECONDITION of this visitor is that it
+/// must be invoked from the original loop to be analyzed. The traversal stops
+/// for Decl's and Expr's given that they may contain inner loops that must not
+/// be counted.
+///
+/// Example AST structure for the code:
+///
+/// int main() {
+/// #pragma omp fuse
+/// {
+/// for (int i = 0; i < 100; i++) { <-- Outer loop
+/// []() {
+/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (1)
+/// };
+/// for(int j = 0; j < 5; ++j) {} <-- Inner loop
+/// }
+/// for (int r = 0; r < 100; r++) { <-- Outer loop
+/// struct LocalClass {
+/// void bar() {
+/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (2)
+/// }
+/// };
+/// for(int k = 0; k < 10; ++k) {} <-- Inner loop
+/// ({x = 5; for(k = 0; k < 10; ++k) x += k; x;}); <-- NOT A LOOP (3)
+/// }
+/// }
+/// }
+/// (1) because it is in a different function (here: a lambda)
+/// (2) because it is in a different function (here: a class method)
+/// (3) because it is considered intervening code of a non-perfectly nested
+///     loop
+/// Result: Loop 'i' contains 2 loops, Loop 'r' also contains 2 loops.
+class NestedLoopCounterVisitor final : public DynamicRecursiveASTVisitor {
+private:
+ unsigned NestedLoopCount = 0;
+
+public:
+ explicit NestedLoopCounterVisitor() = default;
+
+ unsigned getNestedLoopCount() const { return NestedLoopCount; }
+
+ bool VisitForStmt(ForStmt *FS) override {
+ ++NestedLoopCount;
+ return true;
+ }
+
+ bool VisitCXXForRangeStmt(CXXForRangeStmt *FRS) override {
+ ++NestedLoopCount;
+ return true;
+ }
+
+ bool TraverseStmt(Stmt *S) override {
+ if (!S)
+ return true;
+
+ // Skip traversal of all expressions, including special cases like
+ // LambdaExpr, StmtExpr, BlockExpr, and RequiresExpr. These expressions
+ // may contain inner statements (and even loops), but they are not part
+ // of the syntactic body of the surrounding loop structure.
+ // Therefore they must not be counted.
+ if (isa<Expr>(S))
+ return true;
+
+ // Only recurse into CompoundStmt (block {}) and loop bodies.
+ if (isa<CompoundStmt, ForStmt, CXXForRangeStmt>(S)) {
+ return DynamicRecursiveASTVisitor::TraverseStmt(S);
+ }
+
+ // Stop traversal for all other statements, which break perfect
+ // loop nesting, such as control flow (IfStmt, SwitchStmt...).
+ return true;
+ }
+
+ bool TraverseDecl(Decl *D) override {
+ // Stop when a declaration is found; declarations are irrelevant for
+ // finding nested loops (possible CXXRecordDecl, RecordDecl,
+ // FunctionDecl...).
+ return true;
+ }
+};
+
+bool SemaOpenMP::analyzeLoopSequence(Stmt *LoopSeqStmt,
+ LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context,
+ OpenMPDirectiveKind Kind) {
+ VarsWithInheritedDSAType TmpDSA;
+ // Helper Lambda to handle storing initialization and body statements for
+ // both ForStmt and CXXForRangeStmt.
+ auto StoreLoopStatements = [](LoopAnalysis &Analysis, Stmt *LoopStmt) {
+ if (auto *For = dyn_cast<ForStmt>(LoopStmt)) {
+ Analysis.OriginalInits.push_back(For->getInit());
+ Analysis.TheForStmt = For;
+ } else {
+ auto *CXXFor = cast<CXXForRangeStmt>(LoopStmt);
+ Analysis.OriginalInits.push_back(CXXFor->getBeginStmt());
+ Analysis.TheForStmt = CXXFor;
+ }
+ };
+
+ // Helper lambda functions to encapsulate the processing of the different
+ // derivations of the canonical loop sequence grammar; modularized code
+ // for handling loop generation and transformations.
+ auto AnalyzeLoopGeneration = [&](Stmt *Child) {
+ auto *LoopTransform = cast<OMPLoopTransformationDirective>(Child);
+ Stmt *TransformedStmt = LoopTransform->getTransformedStmt();
+ unsigned NumGeneratedTopLevelLoops =
+ LoopTransform->getNumGeneratedTopLevelLoops();
+ // Handle the case where the transformed statement is not available
+ // due to dependent contexts.
+ if (!TransformedStmt) {
+ if (NumGeneratedTopLevelLoops > 0) {
+ SeqAnalysis.LoopSeqSize += NumGeneratedTopLevelLoops;
+ return true;
+ }
+ // Unroll full (0 loops produced)
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ // Handle loop transformations with multiple loop nests
+ // Unroll full
+ if (!NumGeneratedTopLevelLoops) {
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ // Loop transformations such as split or loopranged fuse.
+ if (NumGeneratedTopLevelLoops > 1) {
+ // Get the preinits related to this loop-sequence-generating
+ // loop transformation (i.e. loopranged fuse, split...).
+ // These preinits differ slightly from regular inits/pre-inits related
+ // to single-loop-generating transformations (interchange, unroll),
+ // given that they are not bound to a particular loop nest and so
+ // need to be treated independently.
+ updatePreInits(LoopTransform, SeqAnalysis.LoopSequencePreInits);
+ return analyzeLoopSequence(TransformedStmt, SeqAnalysis, Context, Kind);
+ }
+ // Vast majority of cases (Tile, Unroll, Stripe, Reverse, Interchange,
+ // and complete Fuse): process the transformed loop statement.
+ LoopAnalysis &NewTransformedSingleLoop =
+ SeqAnalysis.Loops.emplace_back(Child);
+ unsigned IsCanonical = checkOpenMPLoop(
+ Kind, nullptr, nullptr, TransformedStmt, SemaRef, *DSAStack, TmpDSA,
+ NewTransformedSingleLoop.HelperExprs);
+
+ if (!IsCanonical)
+ return false;
+
+ StoreLoopStatements(NewTransformedSingleLoop, TransformedStmt);
+ updatePreInits(LoopTransform, NewTransformedSingleLoop.TransformsPreInits);
+
+ SeqAnalysis.LoopSeqSize++;
+ return true;
+ };
+
+ // Modularized code for handling regular canonical loops.
+ auto AnalyzeRegularLoop = [&](Stmt *Child) {
+ LoopAnalysis &NewRegularLoop = SeqAnalysis.Loops.emplace_back(Child);
+ unsigned IsCanonical =
+ checkOpenMPLoop(Kind, nullptr, nullptr, Child, SemaRef, *DSAStack,
+ TmpDSA, NewRegularLoop.HelperExprs);
+
+ if (!IsCanonical)
+ return false;
+
+ StoreLoopStatements(NewRegularLoop, Child);
+ NestedLoopCounterVisitor NLCV;
+ NLCV.TraverseStmt(Child);
+ return true;
+ };
+
+ // High level grammar validation.
+ for (Stmt *Child : LoopSeqStmt->children()) {
+ if (!Child)
+ continue;
+ // Skip over non-loop-sequence statements.
+ if (!LoopSequenceAnalysis::isLoopSequenceDerivation(Child)) {
+ Child = Child->IgnoreContainers();
+ // Ignore empty compound statement.
+ if (!Child)
+ continue;
+ // In the case of a nested loop sequence, ignoring containers is not
+ // enough; a recursive traversal of the loop sequence is required.
+ if (isa<CompoundStmt>(Child)) {
+ if (!analyzeLoopSequence(Child, SeqAnalysis, Context, Kind))
+ return false;
+ // Already handled; skip this child.
+ continue;
+ }
+ }
+ // Regular loop sequence handling.
+ if (LoopSequenceAnalysis::isLoopSequenceDerivation(Child)) {
+ if (LoopAnalysis::isLoopTransformation(Child)) {
+ if (!AnalyzeLoopGeneration(Child))
+ return false;
+ // AnalyzeLoopGeneration updates SeqAnalysis.LoopSeqSize accordingly.
+ } else {
+ if (!AnalyzeRegularLoop(Child))
+ return false;
+ SeqAnalysis.LoopSeqSize++;
+ }
+ } else {
+ // Report error for invalid statement inside canonical loop sequence.
+ Diag(Child->getBeginLoc(), diag::err_omp_not_for)
+ << 0 << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool SemaOpenMP::checkTransformableLoopSequence(
+ OpenMPDirectiveKind Kind, Stmt *AStmt, LoopSequenceAnalysis &SeqAnalysis,
+ ASTContext &Context) {
+ // Following OpenMP 6.0 API Specification, a Canonical Loop Sequence follows
+ // the grammar:
+ //
+ // canonical-loop-sequence:
+ // {
+ // loop-sequence+
+ // }
+ // where loop-sequence can be any of the following:
+ // 1. canonical-loop-sequence
+ // 2. loop-nest
+ // 3. loop-sequence-generating-construct (i.e. OMPLoopTransformationDirective)
+ //
+ // To recognize and traverse this structure, the helper function
+ // analyzeLoopSequence serves as the recursive entry point and tries to
+ // match the input AST to the canonical loop sequence grammar structure.
+ // This function performs both a semantic and a syntactic analysis of the
+ // given statement according to the OpenMP 6.0 definition of the
+ // aforementioned canonical loop sequence.
+
+ // We expect an outer compound statement.
+ if (!isa<CompoundStmt>(AStmt)) {
+ Diag(AStmt->getBeginLoc(), diag::err_omp_not_a_loop_sequence)
+ << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+
+ // Recursive entry point to process the main loop sequence
+ if (!analyzeLoopSequence(AStmt, SeqAnalysis, Context, Kind))
+ return false;
+
+ // Diagnose an empty loop sequence.
+ if (!SeqAnalysis.LoopSeqSize) {
+ Diag(AStmt->getBeginLoc(), diag::err_omp_empty_loop_sequence)
+ << getOpenMPDirectiveName(Kind);
+ return false;
+ }
+ return true;
+}
+
+/// Add preinit statements that need to be propagated from the selected loop.
static void addLoopPreInits(ASTContext &Context,
OMPLoopBasedDirective::HelperExprs &LoopHelper,
Stmt *LoopStmt, ArrayRef<Stmt *> OriginalInit,
@@ -14540,7 +14812,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 4> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 4> OriginalInits;
if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body,
OriginalInits))
return StmtError();
@@ -14817,7 +15089,7 @@ StmtResult SemaOpenMP::ActOnOpenMPStripeDirective(ArrayRef<OMPClause *> Clauses,
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 4> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 4> OriginalInits;
if (!checkTransformableLoopNest(OMPD_stripe, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15078,7 +15350,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
Stmt *Body = nullptr;
SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
NumLoops);
- SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, NumLoops + 1> OriginalInits;
if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15348,7 +15620,7 @@ StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt,
Stmt *Body = nullptr;
SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
NumLoops);
- SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, NumLoops + 1> OriginalInits;
if (!checkTransformableLoopNest(OMPD_reverse, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15540,7 +15812,7 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective(
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<Stmt *, 0>, 2> OriginalInits;
+ SmallVector<SmallVector<Stmt *>, 2> OriginalInits;
if (!checkTransformableLoopNest(OMPD_interchange, AStmt, NumLoops,
LoopHelpers, Body, OriginalInits))
return StmtError();
@@ -15716,6 +15988,484 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective(
buildPreInits(Context, PreInits));
}
+StmtResult SemaOpenMP::ActOnOpenMPFuseDirective(ArrayRef<OMPClause *> Clauses,
+ Stmt *AStmt,
+ SourceLocation StartLoc,
+ SourceLocation EndLoc) {
+
+ ASTContext &Context = getASTContext();
+ DeclContext *CurrContext = SemaRef.CurContext;
+ Scope *CurScope = SemaRef.getCurScope();
+ CaptureVars CopyTransformer(SemaRef);
+
+ // Ensure the structured block is not empty
+ if (!AStmt)
+ return StmtError();
+
+ // Defer the transformation in dependent contexts. The NumLoopNests
+ // argument is set to a placeholder of 1 (even though using looprange
+ // fuse could yield several top-level loop nests) because a dependent
+ // context can prevent determining its true value.
+ if (CurrContext->isDependentContext())
+ return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses,
+ /* NumLoops */ 1, AStmt, nullptr, nullptr);
+
+ // Validate that the potential loop sequence is transformable for fusion.
+ // Also collect the HelperExprs, loop Stmts, inits, and number of loops.
+ LoopSequenceAnalysis SeqAnalysis;
+ if (!checkTransformableLoopSequence(OMPD_fuse, AStmt, SeqAnalysis, Context))
+ return StmtError();
+
+ // SeqAnalysis.LoopSeqSize exists mostly to handle dependent contexts,
+ // otherwise it must be the same as SeqAnalysis.Loops.size().
+ assert(SeqAnalysis.LoopSeqSize == SeqAnalysis.Loops.size() &&
+ "Inconsistent size of the loop sequence and the number of loops "
+ "found in the sequence");
+
+ // Handle clauses, which can be any of the following: [looprange, apply]
+ const auto *LRC =
+ OMPExecutableDirective::getSingleClause<OMPLoopRangeClause>(Clauses);
+
+ // The clause arguments are invalidated if any error arises
+ // such as non-constant or non-positive arguments
+ if (LRC && (!LRC->getFirst() || !LRC->getCount()))
+ return StmtError();
+
+ // Delayed semantic check of LoopRange constraint
+ // Evaluates the loop range arguments and returns the first and count values
+ auto EvaluateLoopRangeArguments = [&Context](Expr *First, Expr *Count,
+ uint64_t &FirstVal,
+ uint64_t &CountVal) {
+ llvm::APSInt FirstInt = First->EvaluateKnownConstInt(Context);
+ llvm::APSInt CountInt = Count->EvaluateKnownConstInt(Context);
+ FirstVal = FirstInt.getZExtValue();
+ CountVal = CountInt.getZExtValue();
+ };
+
+ // OpenMP [6.0, Restrictions]
+ // first + count - 1 must not evaluate to a value greater than the
+ // loop sequence length of the associated canonical loop sequence.
+ auto ValidLoopRange = [](uint64_t FirstVal, uint64_t CountVal,
+ unsigned NumLoops) -> bool {
+ return FirstVal + CountVal - 1 <= NumLoops;
+ };
+ uint64_t FirstVal = 1, CountVal = 0, LastVal = SeqAnalysis.LoopSeqSize;
+
+ // Validates the loop range after evaluating the semantic information
+ // and ensures that the range is valid for the given loop sequence size.
+ // Expressions are evaluated at compile time to obtain constant values.
+ if (LRC) {
+ EvaluateLoopRangeArguments(LRC->getFirst(), LRC->getCount(), FirstVal,
+ CountVal);
+ if (CountVal == 1)
+ SemaRef.Diag(LRC->getCountLoc(), diag::warn_omp_redundant_fusion)
+ << getOpenMPDirectiveName(OMPD_fuse);
+
+ if (!ValidLoopRange(FirstVal, CountVal, SeqAnalysis.LoopSeqSize)) {
+ SemaRef.Diag(LRC->getFirstLoc(), diag::err_omp_invalid_looprange)
+ << getOpenMPDirectiveName(OMPD_fuse) << FirstVal
+ << (FirstVal + CountVal - 1) << SeqAnalysis.LoopSeqSize;
+ return StmtError();
+ }
+
+ LastVal = FirstVal + CountVal - 1;
+ }
+
+ // Complete fusion generates a single canonical loop nest.
+ // However, the looprange clause may generate several loop nests.
+ unsigned NumGeneratedTopLevelLoops =
+ LRC ? SeqAnalysis.LoopSeqSize - CountVal + 1 : 1;
+
+ // Emit a warning for redundant loop fusion when the sequence contains only
+ // one loop.
+ if (SeqAnalysis.LoopSeqSize == 1)
+ SemaRef.Diag(AStmt->getBeginLoc(), diag::warn_omp_redundant_fusion)
+ << getOpenMPDirectiveName(OMPD_fuse);
+
+ // Select the type with the largest bit width among all induction variables
+ QualType IVType =
+ SeqAnalysis.Loops[FirstVal - 1].HelperExprs.IterationVarRef->getType();
+ for (unsigned I : llvm::seq<unsigned>(FirstVal, LastVal)) {
+ QualType CurrentIVType =
+ SeqAnalysis.Loops[I].HelperExprs.IterationVarRef->getType();
+ if (Context.getTypeSize(CurrentIVType) > Context.getTypeSize(IVType)) {
+ IVType = CurrentIVType;
+ }
+ }
+ uint64_t IVBitWidth = Context.getIntWidth(IVType);
+
+ // Create pre-init declarations for the lower bounds, upper bounds,
+ // strides and num-iterations of every top-level loop in the fusion.
+ SmallVector<VarDecl *, 4> LBVarDecls;
+ SmallVector<VarDecl *, 4> STVarDecls;
+ SmallVector<VarDecl *, 4> NIVarDecls;
+ SmallVector<VarDecl *, 4> UBVarDecls;
+ SmallVector<VarDecl *, 4> IVVarDecls;
+
+ // Helper lambda to create variables for bounds, strides, and other
+ // expressions. Generates both the variable declaration and the corresponding
+ // initialization statement.
+ auto CreateHelperVarAndStmt =
+ [&, &SemaRef = SemaRef](Expr *ExprToCopy, const std::string &BaseName,
+ unsigned I, bool NeedsNewVD = false) {
+ Expr *TransformedExpr =
+ AssertSuccess(CopyTransformer.TransformExpr(ExprToCopy));
+ if (!TransformedExpr)
+ return std::pair<VarDecl *, StmtResult>(nullptr, StmtError());
+
+ auto Name = (Twine(".omp.") + BaseName + std::to_string(I)).str();
+
+ VarDecl *VD;
+ if (NeedsNewVD) {
+ VD = buildVarDecl(SemaRef, SourceLocation(), IVType, Name);
+ SemaRef.AddInitializerToDecl(VD, TransformedExpr, false);
+ } else {
+ // Create a unique variable name
+ DeclRefExpr *DRE = cast<DeclRefExpr>(TransformedExpr);
+ VD = cast<VarDecl>(DRE->getDecl());
+ VD->setDeclName(&SemaRef.PP.getIdentifierTable().get(Name));
+ }
+ // Create the corresponding declaration statement
+ StmtResult DeclStmt = new (Context) class DeclStmt(
+ DeclGroupRef(VD), SourceLocation(), SourceLocation());
+ return std::make_pair(VD, DeclStmt);
+ };
+
+ // PreInits holds a sequence of variable declarations that must be executed
+ // before the fused loop begins. These include bounds, strides, and other
+ // helper variables required for the transformation. Other loop transforms
+ // also contain their own preinits.
+ SmallVector<Stmt *> PreInits;
+
+ // Update the general preinits using the preinits generated by loop sequence
+ // generating loop transformations. These preinits differ slightly from
+ // single-loop transformation preinits, as they can be detached from a
+ // specific loop inside multiple generated loop nests. This happens
+ // because certain helper variables, like '.omp.fuse.max', are introduced to
+ // handle fused iteration spaces and may not be directly tied to a single
+ // original loop. The preinit structure must ensure that hidden variables
+ // like '.omp.fuse.max' are still properly handled.
+ // Transformations that apply this concept: Loopranged Fuse, Split
+ llvm::append_range(PreInits, SeqAnalysis.LoopSequencePreInits);
+
+ // Process each loop to generate and collect declarations and statements
+ // for all helper expressions related to each individual loop nest.
+
+ // For the fused loops, we also keep track of their original inits by
+ // appending them to their preinits statement; for transformations, we
+ // append their preinits as well (which contain the original loop
+ // initialization statement, among other statements).
+
+ // First, set TransformIndex to match the beginning of the looprange
+ // section.
+ unsigned int TransformIndex = 0;
+ for (unsigned I : llvm::seq<unsigned>(FirstVal - 1)) {
+ if (SeqAnalysis.Loops[I].isLoopTransformation())
+ ++TransformIndex;
+ }
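+ // For example, if the loops preceding the looprange section include one
+ // loop produced by a transformation (e.g. unroll), TransformIndex starts
+ // at 1.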
+
+ for (unsigned int I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ if (SeqAnalysis.Loops[I].isRegularLoop()) {
+ addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs,
+ SeqAnalysis.Loops[I].TheForStmt,
+ SeqAnalysis.Loops[I].OriginalInits, PreInits);
+ } else if (SeqAnalysis.Loops[I].isLoopTransformation()) {
+ // For transformed loops, insert both pre-inits and original inits.
+ // Order matters: pre-inits may define variables used in the original
+ // inits such as upper bounds...
+ SmallVector<Stmt *> &TransformPreInit =
+ SeqAnalysis.Loops[TransformIndex++].TransformsPreInits;
+ llvm::append_range(PreInits, TransformPreInit);
+
+ addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs,
+ SeqAnalysis.Loops[I].TheForStmt,
+ SeqAnalysis.Loops[I].OriginalInits, PreInits);
+ }
+ auto [UBVD, UBDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.UB, "ub", J);
+ auto [LBVD, LBDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.LB, "lb", J);
+ auto [STVD, STDStmt] =
+ CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.ST, "st", J);
+ auto [NIVD, NIDStmt] = CreateHelperVarAndStmt(
+ SeqAnalysis.Loops[I].HelperExprs.NumIterations, "ni", J, true);
+ auto [IVVD, IVDStmt] = CreateHelperVarAndStmt(
+ SeqAnalysis.Loops[I].HelperExprs.IterationVarRef, "iv", J);
+
+ assert(LBVD && STVD && NIVD && IVVD &&
+ "OpenMP Fuse Helper variables creation failed");
+
+ UBVarDecls.push_back(UBVD);
+ LBVarDecls.push_back(LBVD);
+ STVarDecls.push_back(STVD);
+ NIVarDecls.push_back(NIVD);
+ IVVarDecls.push_back(IVVD);
+
+ PreInits.push_back(LBDStmt.get());
+ PreInits.push_back(STDStmt.get());
+ PreInits.push_back(NIDStmt.get());
+ PreInits.push_back(IVDStmt.get());
+ }
+
+ auto MakeVarDeclRef = [&SemaRef = this->SemaRef](VarDecl *VD) {
+ return buildDeclRefExpr(SemaRef, VD, VD->getType(), VD->getLocation(),
+ false);
+ };
+
+ // Next, the final fused loop is created with the following shape
+ // (considering the selected loops):
+ //
+ // for (fuse.index = 0; fuse.index < max(ni0, ni1..., nik); ++fuse.index) {
+ // if (fuse.index < ni0){
+ // iv0 = lb0 + st0 * fuse.index;
+ // original.index0 = iv0
+ // body(0);
+ // }
+ // if (fuse.index < ni1){
+ // iv1 = lb1 + st1 * fuse.index;
+ // original.index1 = iv1
+ // body(1);
+ // }
+ //
+ // ...
+ //
+ // if (fuse.index < nik){
+ // ivk = lbk + stk * fuse.index;
+ // original.indexk = ivk
+ // body(k);
+ // }
+ // }
+
+ // 1. Create the initialized fuse index
+ StringRef IndexName = ".omp.fuse.index";
+ Expr *InitVal = IntegerLiteral::Create(Context, llvm::APInt(IVBitWidth, 0),
+ IVType, SourceLocation());
+ VarDecl *IndexDecl =
+ buildVarDecl(SemaRef, {}, IVType, IndexName, nullptr, nullptr);
+ SemaRef.AddInitializerToDecl(IndexDecl, InitVal, false);
+ StmtResult InitStmt = new (Context)
+ DeclStmt(DeclGroupRef(IndexDecl), SourceLocation(), SourceLocation());
+
+ if (!InitStmt.isUsable())
+ return StmtError();
+
+ auto MakeIVRef = [&SemaRef = this->SemaRef, IndexDecl, IVType,
+ Loc = InitVal->getExprLoc()]() {
+ return buildDeclRefExpr(SemaRef, IndexDecl, IVType, Loc, false);
+ };
+
+ // 2. Iteratively compute the max number of logical iterations Max(NI_1,
+ // NI_2, ..., NI_k).
+ //
+ // This loop accumulates the maximum value across multiple expressions,
+ // ensuring each step constructs a unique AST node for correctness. By using
+ // intermediate temporary variables and conditional operators, we maintain
+ // distinct nodes and avoid duplicating subtrees. For instance, max(a,b,c):
+ // omp.temp0 = max(a, b)
+ // omp.temp1 = max(omp.temp0, c)
+ // omp.fuse.max = omp.temp1
+
+ ExprResult MaxExpr;
+ // I iterates over the loops in the sequence that we fuse.
+ for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ DeclRefExpr *NIRef = MakeVarDeclRef(NIVarDecls[J]);
+ QualType NITy = NIRef->getType();
+
+ if (MaxExpr.isUnset()) {
+ // Initialize MaxExpr with the first NI expression
+ MaxExpr = NIRef;
+ } else {
+ // Create a new accumulator variable t_i = MaxExpr
+ std::string TempName = (Twine(".omp.temp.") + Twine(J)).str();
+ VarDecl *TempDecl =
+ buildVarDecl(SemaRef, {}, NITy, TempName, nullptr, nullptr);
+ TempDecl->setInit(MaxExpr.get());
+ DeclRefExpr *TempRef =
+ buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false);
+ DeclRefExpr *TempRef2 =
+ buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false);
+ // Add a DeclStmt to PreInits to ensure the variable is declared.
+ StmtResult TempStmt = new (Context)
+ DeclStmt(DeclGroupRef(TempDecl), SourceLocation(), SourceLocation());
+
+ if (!TempStmt.isUsable())
+ return StmtError();
+ PreInits.push_back(TempStmt.get());
+
+ // Build MaxExpr <- (MaxExpr > NIRef ? MaxExpr : NIRef)
+ ExprResult Comparison =
+ SemaRef.BuildBinOp(nullptr, SourceLocation(), BO_GT, TempRef, NIRef);
+ // Handle any errors in Comparison creation
+ if (!Comparison.isUsable())
+ return StmtError();
+
+ DeclRefExpr *NIRef2 = MakeVarDeclRef(NIVarDecls[J]);
+ // Update MaxExpr using a conditional expression to hold the max value
+ MaxExpr = new (Context) ConditionalOperator(
+ Comparison.get(), SourceLocation(), TempRef2, SourceLocation(),
+ NIRef2->getExprStmt(), NITy, VK_LValue, OK_Ordinary);
+
+ if (!MaxExpr.isUsable())
+ return StmtError();
+ }
+ }
+ if (!MaxExpr.isUsable())
+ return StmtError();
+
+ // 3. Declare the max variable
+ const std::string MaxName = Twine(".omp.fuse.max").str();
+ VarDecl *MaxDecl =
+ buildVarDecl(SemaRef, {}, IVType, MaxName, nullptr, nullptr);
+ MaxDecl->setInit(MaxExpr.get());
+ DeclRefExpr *MaxRef = buildDeclRefExpr(SemaRef, MaxDecl, IVType, {}, false);
+ StmtResult MaxStmt = new (Context)
+ DeclStmt(DeclGroupRef(MaxDecl), SourceLocation(), SourceLocation());
+
+ if (MaxStmt.isInvalid())
+ return StmtError();
+ PreInits.push_back(MaxStmt.get());
+
+ // 4. Create condition Expr: index < n_max
+ ExprResult CondExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT,
+ MakeIVRef(), MaxRef);
+ if (!CondExpr.isUsable())
+ return StmtError();
+
+ // 5. Increment Expr: ++index
+ ExprResult IncrExpr =
+ SemaRef.BuildUnaryOp(CurScope, SourceLocation(), UO_PreInc, MakeIVRef());
+ if (!IncrExpr.isUsable())
+ return StmtError();
+
+ // 6. Build the Fused Loop Body
+ // The final fused loop iterates over the maximum logical range. Inside the
+ // loop, each original loop's index is calculated dynamically, and its body
+ // is executed conditionally.
+ //
+ // Each sub-loop's body is guarded by a conditional statement to ensure
+ // it executes only within its logical iteration range:
+ //
+ // if (fuse.index < ni_k){
+ // iv_k = lb_k + st_k * fuse.index;
+ // original.index = iv_k
+ // body(k);
+ // }
+
+ CompoundStmt *FusedBody = nullptr;
+ SmallVector<Stmt *, 4> FusedBodyStmts;
+ for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) {
+ // Assignment of the original sub-loop index to compute the logical index
+ // IV_k = LB_k + omp.fuse.index * ST_k
+ ExprResult IdxExpr =
+ SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Mul,
+ MakeVarDeclRef(STVarDecls[J]), MakeIVRef());
+ if (!IdxExpr.isUsable())
+ return StmtError();
+ IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Add,
+ MakeVarDeclRef(LBVarDecls[J]), IdxExpr.get());
+
+ if (!IdxExpr.isUsable())
+ return StmtError();
+ IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Assign,
+ MakeVarDeclRef(IVVarDecls[J]), IdxExpr.get());
+ if (!IdxExpr.isUsable())
+ return StmtError();
+
+ // Update the original i_k = IV_k
+ SmallVector<Stmt *, 4> BodyStmts;
+ BodyStmts.push_back(IdxExpr.get());
+ llvm::append_range(BodyStmts, SeqAnalysis.Loops[I].HelperExprs.Updates);
+
+ // If the loop is a CXXForRangeStmt then the iterator variable is needed
+ if (auto *SourceCXXFor =
+ dyn_cast<CXXForRangeStmt>(SeqAnalysis.Loops[I].TheForStmt))
+ BodyStmts.push_back(SourceCXXFor->getLoopVarStmt());
+
+ Stmt *Body =
+ (isa<ForStmt>(SeqAnalysis.Loops[I].TheForStmt))
+ ? cast<ForStmt>(SeqAnalysis.Loops[I].TheForStmt)->getBody()
+ : cast<CXXForRangeStmt>(SeqAnalysis.Loops[I].TheForStmt)->getBody();
+ BodyStmts.push_back(Body);
+
+ CompoundStmt *CombinedBody =
+ CompoundStmt::Create(Context, BodyStmts, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+ ExprResult Condition =
+ SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT, MakeIVRef(),
+ MakeVarDeclRef(NIVarDecls[J]));
+
+ if (!Condition.isUsable())
+ return StmtError();
+
+ IfStmt *IfStatement = IfStmt::Create(
+ Context, SourceLocation(), IfStatementKind::Ordinary, nullptr, nullptr,
+ Condition.get(), SourceLocation(), SourceLocation(), CombinedBody,
+ SourceLocation(), nullptr);
+
+ FusedBodyStmts.push_back(IfStatement);
+ }
+ FusedBody = CompoundStmt::Create(Context, FusedBodyStmts, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+
+ // 7. Construct the final fused loop
+ ForStmt *FusedForStmt = new (Context)
+ ForStmt(Context, InitStmt.get(), CondExpr.get(), nullptr, IncrExpr.get(),
+ FusedBody, InitStmt.get()->getBeginLoc(), SourceLocation(),
+ IncrExpr.get()->getEndLoc());
+
+ // In the case of looprange, the result of fuse won't simply be a single
+ // loop (ForStmt), but rather a loop sequence (CompoundStmt) of 3 parts:
+ // the pre-fusion loops, the fused loop, and the post-fusion loops,
+ // preserving their original order.
+ //
+ // Note: if the looprange clause produces a single fused loop nest, this
+ // compound statement wrapper is unnecessary and is skipped.
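+ //
+ // For example, with a sequence of 4 loops and looprange(2,2), the result
+ // is { L1, fused(L2, L3), L4 }.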
+
+ Stmt *FusionStmt = FusedForStmt;
+ if (LRC && CountVal != SeqAnalysis.LoopSeqSize) {
+ SmallVector<Stmt *, 4> FinalLoops;
+
+ // Reset the transform index
+ TransformIndex = 0;
+
+ // Collect all non-fused loops before and after the fused region.
+ // Pre-fusion and post-fusion loops are inserted in order exploiting their
+ // symmetry, along with their corresponding transformation pre-inits if
+ // needed. The fused loop is added between the two regions.
+ for (unsigned I : llvm::seq<unsigned>(SeqAnalysis.LoopSeqSize)) {
+ if (I >= FirstVal - 1 && I < FirstVal + CountVal - 1) {
+ // Update the Transformation counter to skip already treated
+ // loop transformations
+ if (!SeqAnalysis.Loops[I].isLoopTransformation())
+ ++TransformIndex;
+ continue;
+ }
+
+ // No need to handle:
+ // - Regular loops: they are kept intact as-is.
+ // - Loop-sequence-generating transformations: already handled earlier.
+ // Only TransformSingleLoop requires inserting pre-inits here.
+ if (SeqAnalysis.Loops[I].isRegularLoop()) {
+ const auto &TransformPreInit =
+ SeqAnalysis.Loops[TransformIndex++].TransformsPreInits;
+ if (!TransformPreInit.empty())
+ llvm::append_range(PreInits, TransformPreInit);
+ }
+
+ FinalLoops.push_back(SeqAnalysis.Loops[I].TheForStmt);
+ }
+
+ FinalLoops.insert(FinalLoops.begin() + (FirstVal - 1), FusedForStmt);
+ FusionStmt = CompoundStmt::Create(Context, FinalLoops, FPOptionsOverride(),
+ SourceLocation(), SourceLocation());
+ }
+ return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses,
+ NumGeneratedTopLevelLoops, AStmt, FusionStmt,
+ buildPreInits(Context, PreInits));
+}
+
OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
Expr *Expr,
SourceLocation StartLoc,
@@ -16887,6 +17637,31 @@ OMPClause *SemaOpenMP::ActOnOpenMPPartialClause(Expr *FactorExpr,
FactorExpr);
}
+OMPClause *SemaOpenMP::ActOnOpenMPLoopRangeClause(
+ Expr *First, Expr *Count, SourceLocation StartLoc, SourceLocation LParenLoc,
+ SourceLocation FirstLoc, SourceLocation CountLoc, SourceLocation EndLoc) {
+
+ // OpenMP [6.0, Restrictions]
+ // First and Count must be integer expressions with positive values.
+ ExprResult FirstVal =
+ VerifyPositiveIntegerConstantInClause(First, OMPC_looprange);
+ if (FirstVal.isInvalid())
+ First = nullptr;
+
+ ExprResult CountVal =
+ VerifyPositiveIntegerConstantInClause(Count, OMPC_looprange);
+ if (CountVal.isInvalid())
+ Count = nullptr;
+
+ // OpenMP [6.0, Restrictions]
+ // first + count - 1 must not evaluate to a value greater than the
+ // loop sequence length of the associated canonical loop sequence.
+ // This check must be performed later due to the delayed parsing and
+ // computation of the associated loop sequence.
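+ //
+ // For example, given a canonical loop sequence of four loops,
+ //   #pragma omp fuse looprange(2,3)
+ // selects loops 2 through 4, so first + count - 1 == 4 <= 4 holds.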
+ return OMPLoopRangeClause::Create(getASTContext(), StartLoc, LParenLoc,
+ FirstLoc, CountLoc, EndLoc, First, Count);
+}
+
OMPClause *SemaOpenMP::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc,
SourceLocation LParenLoc,
SourceLocation EndLoc) {
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 3ebbb30..2bf1511 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -7102,7 +7102,7 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType,
// If the parameter type somehow involves auto, deduce the type now.
DeducedType *DeducedT = ParamType->getContainedDeducedType();
- bool IsDeduced = DeducedT && !DeducedT->isDeduced();
+ bool IsDeduced = DeducedT && DeducedT->getDeducedType().isNull();
if (IsDeduced) {
// When checking a deduced template argument, deduce from its type even if
// the type is dependent, in order to check the types of non-type template
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 1ff94d7..f1c9c5c 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -616,29 +616,30 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
Invalid = true;
return;
}
- Invalid = CheckInstantiationDepth(PointOfInstantiation, InstantiationRange);
+
+ CodeSynthesisContext Inst;
+ Inst.Kind = Kind;
+ Inst.PointOfInstantiation = PointOfInstantiation;
+ Inst.Entity = Entity;
+ Inst.Template = Template;
+ Inst.TemplateArgs = TemplateArgs.data();
+ Inst.NumTemplateArgs = TemplateArgs.size();
+ Inst.DeductionInfo = DeductionInfo;
+ Inst.InstantiationRange = InstantiationRange;
+ Inst.InConstraintSubstitution =
+ Inst.Kind == CodeSynthesisContext::ConstraintSubstitution;
+ if (!SemaRef.CodeSynthesisContexts.empty())
+ Inst.InConstraintSubstitution |=
+ SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution;
+
+ Invalid = SemaRef.pushCodeSynthesisContext(Inst);
if (!Invalid) {
- CodeSynthesisContext Inst;
- Inst.Kind = Kind;
- Inst.PointOfInstantiation = PointOfInstantiation;
- Inst.Entity = Entity;
- Inst.Template = Template;
- Inst.TemplateArgs = TemplateArgs.data();
- Inst.NumTemplateArgs = TemplateArgs.size();
- Inst.DeductionInfo = DeductionInfo;
- Inst.InstantiationRange = InstantiationRange;
- Inst.InConstraintSubstitution =
- Inst.Kind == CodeSynthesisContext::ConstraintSubstitution;
- if (!SemaRef.CodeSynthesisContexts.empty())
- Inst.InConstraintSubstitution |=
- SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution;
-
- SemaRef.pushCodeSynthesisContext(Inst);
-
- AlreadyInstantiating = !Inst.Entity ? false :
- !SemaRef.InstantiatingSpecializations
- .insert({Inst.Entity->getCanonicalDecl(), Inst.Kind})
- .second;
+ AlreadyInstantiating =
+ !Inst.Entity
+ ? false
+ : !SemaRef.InstantiatingSpecializations
+ .insert({Inst.Entity->getCanonicalDecl(), Inst.Kind})
+ .second;
atTemplateBegin(SemaRef.TemplateInstCallbacks, SemaRef, Inst);
}
}
@@ -834,18 +835,34 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
: InstantiatingTemplate(SemaRef, CodeSynthesisContext::PartialOrderingTTP,
ArgLoc, InstantiationRange, PArg) {}
-void Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) {
+bool Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) {
Ctx.SavedInNonInstantiationSFINAEContext = InNonInstantiationSFINAEContext;
InNonInstantiationSFINAEContext = false;
- CodeSynthesisContexts.push_back(Ctx);
-
- if (!Ctx.isInstantiationRecord())
+ if (!Ctx.isInstantiationRecord()) {
++NonInstantiationEntries;
+ } else {
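+ // Check the instantiation depth before pushing the context; this check
+ // previously lived in InstantiatingTemplate::CheckInstantiationDepth.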
+ assert(SemaRef.NonInstantiationEntries <=
+ SemaRef.CodeSynthesisContexts.size());
+ if ((SemaRef.CodeSynthesisContexts.size() -
+ SemaRef.NonInstantiationEntries) >
+ SemaRef.getLangOpts().InstantiationDepth) {
+ SemaRef.Diag(Ctx.PointOfInstantiation,
+ diag::err_template_recursion_depth_exceeded)
+ << SemaRef.getLangOpts().InstantiationDepth << Ctx.InstantiationRange;
+ SemaRef.Diag(Ctx.PointOfInstantiation,
+ diag::note_template_recursion_depth)
+ << SemaRef.getLangOpts().InstantiationDepth;
+ return true;
+ }
+ }
+
+ CodeSynthesisContexts.push_back(Ctx);
// Check to see if we're low on stack space. We can't do anything about this
// from here, but we can at least warn the user.
StackHandler.warnOnStackNearlyExhausted(Ctx.PointOfInstantiation);
+ return false;
}
void Sema::popCodeSynthesisContext() {
@@ -907,25 +924,6 @@ static std::string convertCallArgsToString(Sema &S,
return Result;
}
-bool Sema::InstantiatingTemplate::CheckInstantiationDepth(
- SourceLocation PointOfInstantiation,
- SourceRange InstantiationRange) {
- assert(SemaRef.NonInstantiationEntries <=
- SemaRef.CodeSynthesisContexts.size());
- if ((SemaRef.CodeSynthesisContexts.size() -
- SemaRef.NonInstantiationEntries)
- <= SemaRef.getLangOpts().InstantiationDepth)
- return false;
-
- SemaRef.Diag(PointOfInstantiation,
- diag::err_template_recursion_depth_exceeded)
- << SemaRef.getLangOpts().InstantiationDepth
- << InstantiationRange;
- SemaRef.Diag(PointOfInstantiation, diag::note_template_recursion_depth)
- << SemaRef.getLangOpts().InstantiationDepth;
- return true;
-}
-
void Sema::PrintInstantiationStack(InstantiationContextDiagFuncRef DiagFunc) {
// Determine which template instantiations to skip, if any.
unsigned SkipStart = CodeSynthesisContexts.size(), SkipEnd = SkipStart;
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index c2427dcf..6c798d6 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -1163,13 +1163,16 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT,
// - it has at least one trivial eligible constructor and a trivial,
// non-deleted destructor.
const CXXDestructorDecl *Dtor = RD->getDestructor();
- if (UnqualT->isAggregateType())
- if (Dtor && !Dtor->isUserProvided())
- return true;
- if (RD->hasTrivialDestructor() && (!Dtor || !Dtor->isDeleted()))
- if (RD->hasTrivialDefaultConstructor() ||
- RD->hasTrivialCopyConstructor() || RD->hasTrivialMoveConstructor())
- return true;
+ if (UnqualT->isAggregateType() && (!Dtor || !Dtor->isUserProvided()))
+ return true;
+ if (RD->hasTrivialDestructor() && (!Dtor || !Dtor->isDeleted())) {
+ for (CXXConstructorDecl *Ctr : RD->ctors()) {
+ if (Ctr->isIneligibleOrNotSelected() || Ctr->isDeleted())
+ continue;
+ if (Ctr->isTrivial())
+ return true;
+ }
+ }
return false;
}
case UTT_IsIntangibleType:
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 0214078..6967301 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -1783,6 +1783,14 @@ public:
LParenLoc, EndLoc);
}
+ OMPClause *
+ RebuildOMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation FirstLoc,
+ SourceLocation CountLoc, SourceLocation EndLoc) {
+ return getSema().OpenMP().ActOnOpenMPLoopRangeClause(
+ First, Count, StartLoc, LParenLoc, FirstLoc, CountLoc, EndLoc);
+ }
+
/// Build a new OpenMP 'allocator' clause.
///
/// By default, performs semantic analysis to build the new OpenMP clause.
@@ -9609,6 +9617,17 @@ StmtResult TreeTransform<Derived>::TransformOMPInterchangeDirective(
template <typename Derived>
StmtResult
+TreeTransform<Derived>::TransformOMPFuseDirective(OMPFuseDirective *D) {
+ DeclarationNameInfo DirName;
+ getDerived().getSema().OpenMP().StartOpenMPDSABlock(
+ D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc());
+ StmtResult Res = getDerived().TransformOMPExecutableDirective(D);
+ getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get());
+ return Res;
+}
+
+template <typename Derived>
+StmtResult
TreeTransform<Derived>::TransformOMPForDirective(OMPForDirective *D) {
DeclarationNameInfo DirName;
getDerived().getSema().OpenMP().StartOpenMPDSABlock(
@@ -10502,6 +10521,31 @@ TreeTransform<Derived>::TransformOMPPartialClause(OMPPartialClause *C) {
template <typename Derived>
OMPClause *
+TreeTransform<Derived>::TransformOMPLoopRangeClause(OMPLoopRangeClause *C) {
+ ExprResult F = getDerived().TransformExpr(C->getFirst());
+ if (F.isInvalid())
+ return nullptr;
+
+ ExprResult Cn = getDerived().TransformExpr(C->getCount());
+ if (Cn.isInvalid())
+ return nullptr;
+
+ Expr *First = F.get();
+ Expr *Count = Cn.get();
+
+ bool Changed = (First != C->getFirst()) || (Count != C->getCount());
+
+ // If no changes and AlwaysRebuild() is false, return the original clause
+ if (!Changed && !getDerived().AlwaysRebuild())
+ return C;
+
+ return RebuildOMPLoopRangeClause(First, Count, C->getBeginLoc(),
+ C->getLParenLoc(), C->getFirstLoc(),
+ C->getCountLoc(), C->getEndLoc());
+}
+
+template <typename Derived>
+OMPClause *
TreeTransform<Derived>::TransformOMPCollapseClause(OMPCollapseClause *C) {
ExprResult E = getDerived().TransformExpr(C->getNumForLoops());
if (E.isInvalid())
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 9ee8a0f..c05e428 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11215,6 +11215,9 @@ OMPClause *OMPClauseReader::readClause() {
case llvm::omp::OMPC_partial:
C = OMPPartialClause::CreateEmpty(Context);
break;
+ case llvm::omp::OMPC_looprange:
+ C = OMPLoopRangeClause::CreateEmpty(Context);
+ break;
case llvm::omp::OMPC_allocator:
C = new (Context) OMPAllocatorClause();
break;
@@ -11618,6 +11621,14 @@ void OMPClauseReader::VisitOMPPartialClause(OMPPartialClause *C) {
C->setLParenLoc(Record.readSourceLocation());
}
+void OMPClauseReader::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) {
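+ // The read order must mirror the write order in
+ // OMPClauseWriter::VisitOMPLoopRangeClause.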
+ C->setFirst(Record.readSubExpr());
+ C->setCount(Record.readSubExpr());
+ C->setLParenLoc(Record.readSourceLocation());
+ C->setFirstLoc(Record.readSourceLocation());
+ C->setCountLoc(Record.readSourceLocation());
+}
+
void OMPClauseReader::VisitOMPAllocatorClause(OMPAllocatorClause *C) {
C->setAllocator(Record.readExpr());
C->setLParenLoc(Record.readSourceLocation());
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 213c2c2..70b898a 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2469,10 +2469,21 @@ void ASTStmtReader::VisitOMPReverseDirective(OMPReverseDirective *D) {
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void ASTStmtReader::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitStmt(D);
+ VisitOMPExecutableDirective(D);
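+ // The generated top-level loop count is serialized as a 32-bit value; see
+ // ASTStmtWriter::VisitOMPCanonicalLoopSequenceTransformationDirective.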
+ D->setNumGeneratedTopLevelLoops(Record.readUInt32());
+}
+
void ASTStmtReader::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void ASTStmtReader::VisitOMPFuseDirective(OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+}
+
void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) {
VisitOMPLoopDirective(D);
D->setHasCancel(Record.readBool());
@@ -3615,6 +3626,12 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
break;
}
+ case STMT_OMP_FUSE_DIRECTIVE: {
+ unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+ S = OMPFuseDirective::CreateEmpty(Context, NumClauses);
+ break;
+ }
+
case STMT_OMP_INTERCHANGE_DIRECTIVE: {
unsigned NumLoops = Record[ASTStmtReader::NumStmtFields];
unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 09859da..cdf95ba 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7882,6 +7882,14 @@ void OMPClauseWriter::VisitOMPPartialClause(OMPPartialClause *C) {
Record.AddSourceLocation(C->getLParenLoc());
}
+void OMPClauseWriter::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) {
+ Record.AddStmt(C->getFirst());
+ Record.AddStmt(C->getCount());
+ Record.AddSourceLocation(C->getLParenLoc());
+ Record.AddSourceLocation(C->getFirstLoc());
+ Record.AddSourceLocation(C->getCountLoc());
+}
+
void OMPClauseWriter::VisitOMPAllocatorClause(OMPAllocatorClause *C) {
Record.AddStmt(C->getAllocator());
Record.AddSourceLocation(C->getLParenLoc());
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 21c04dd..ebda91e 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2487,6 +2487,18 @@ void ASTStmtWriter::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
Code = serialization::STMT_OMP_INTERCHANGE_DIRECTIVE;
}
+void ASTStmtWriter::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitStmt(D);
+ VisitOMPExecutableDirective(D);
+ Record.writeUInt32(D->getNumGeneratedTopLevelLoops());
+}
+
+void ASTStmtWriter::VisitOMPFuseDirective(OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+ Code = serialization::STMT_OMP_FUSE_DIRECTIVE;
+}
+
void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) {
VisitOMPLoopDirective(D);
Record.writeBool(D->hasCancel());
diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
index 36f316d..0ae784c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp
@@ -672,6 +672,10 @@ ProgramStateRef CStringChecker::CheckOverlap(CheckerContext &C,
ProgramStateRef stateTrue, stateFalse;
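+ // Overlap reasoning only applies to pointer arguments; bail out if the
+ // analyzed code redeclares a standard function with non-pointer parameters.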
+ if (!First.Expression->getType()->isAnyPointerType() ||
+ !Second.Expression->getType()->isAnyPointerType())
+ return state;
+
// Assume different address spaces cannot overlap.
if (First.Expression->getType()->getPointeeType().getAddressSpace() !=
Second.Expression->getType()->getPointeeType().getAddressSpace())
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 785cdfa..4e472b7 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1814,6 +1814,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
case Stmt::OMPStripeDirectiveClass:
case Stmt::OMPTileDirectiveClass:
case Stmt::OMPInterchangeDirectiveClass:
+ case Stmt::OMPFuseDirectiveClass:
case Stmt::OMPInteropDirectiveClass:
case Stmt::OMPDispatchDirectiveClass:
case Stmt::OMPMaskedDirectiveClass:
diff --git a/clang/test/AST/ByteCode/cxx23.cpp b/clang/test/AST/ByteCode/cxx23.cpp
index 72c751d..ce0a4777 100644
--- a/clang/test/AST/ByteCode/cxx23.cpp
+++ b/clang/test/AST/ByteCode/cxx23.cpp
@@ -1,8 +1,8 @@
// UNSUPPORTED: target={{.*}}-zos{{.*}}
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref,ref20,all,all20 %s
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref,ref23,all,all23 %s
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=expected23,all,all23 %s -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -Wno-deprecated-volatile -verify=ref,ref20,all,all20 %s
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -Wno-deprecated-volatile -verify=ref,ref23,all,all23 %s
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -Wno-deprecated-volatile -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -Wno-deprecated-volatile -verify=expected23,all,all23 %s -fexperimental-new-constant-interpreter
#define assert_active(F) if (!__builtin_is_within_lifetime(&F)) (1/0);
@@ -393,6 +393,59 @@ namespace UnionMemberCallDiags {
static_assert(g()); // all-error {{not an integral constant expression}} \
// all-note {{in call to}}
}
+#endif
+
+namespace VolatileWrites {
+ constexpr void test1() {// all20-error {{never produces a constant expression}}
+ int k;
+ volatile int &m = k;
+ m = 10; // all20-note {{assignment to volatile-qualified type 'volatile int'}}
+ }
+ constexpr void test2() { // all20-error {{never produces a constant expression}}
+ volatile int k = 12;
+ k = 13; // all20-note {{assignment to volatile-qualified type 'volatile int'}}
+ }
+
+ constexpr void test3() { // all20-error {{never produces a constant expression}}
+ volatile int k = 12; // all20-note {{volatile object declared here}}
+
+ *((int *)&k) = 13; // all20-note {{assignment to volatile object 'k' is not allowed in a constant expression}}
+ }
+
+ constexpr void test4() { // all20-error {{never produces a constant expression}}
+ int k = 12;
+
+ *((volatile int *)&k) = 13; // all20-note {{assignment to volatile-qualified type 'volatile int' is not allowed in a constant expression}}
+ }
+
+#if __cplusplus >= 202302L
+ struct S {
+ volatile int k;
+ };
+ constexpr int test5() {
+ S s;
+ s.k = 12; // all-note {{assignment to volatile-qualified type 'volatile int' is not}}
+
+ return 0;
+ }
+ static_assert(test5() == 0); // all-error{{not an integral constant expression}} \
+ // all-note {{in call to}}
#endif
+
+ constexpr bool test6(volatile int k) { // ref20-error {{never produces a constant expression}}
+ k = 14; // ref20-note {{assignment to volatile-qualified type 'volatile int' is not}} \
+ // all-note {{assignment to volatile-qualified type 'volatile int' is not}}
+ return true;
+ }
+ static_assert(test6(5)); // all-error {{not an integral constant expression}} \
+ // all-note {{in call to}}
+
+ constexpr bool test7(volatile int k) { // all-note {{declared here}}
+ *((int *)&k) = 13; // all-note {{assignment to volatile object 'k' is not allowed in a constant expression}}
+ return true;
+ }
+ static_assert(test7(12)); // all-error {{not an integral constant expression}} \
+ // all-note {{in call to}}
+}
diff --git a/clang/test/AST/ByteCode/invalid.cpp b/clang/test/AST/ByteCode/invalid.cpp
index affb40ea..00db274 100644
--- a/clang/test/AST/ByteCode/invalid.cpp
+++ b/clang/test/AST/ByteCode/invalid.cpp
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s
-// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref,both %s
+// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref,both %s
namespace Throw {
diff --git a/clang/test/Analysis/buffer-overlap-decls.c b/clang/test/Analysis/buffer-overlap-decls.c
new file mode 100644
index 0000000..4830f4e
--- /dev/null
+++ b/clang/test/Analysis/buffer-overlap-decls.c
@@ -0,0 +1,23 @@
+// RUN: %clang_analyze_cc1 -verify %s -Wno-incompatible-library-redeclaration \
+// RUN: -analyzer-checker=alpha.unix.cstring.BufferOverlap
+// expected-no-diagnostics
+
+typedef typeof(sizeof(int)) size_t;
+
+void memcpy(int dst, int src, size_t size);
+
+void test_memcpy_proxy() {
+ memcpy(42, 42, 42); // no-crash
+}
+
+void strcpy(int dst, char *src);
+
+void test_strcpy_proxy() {
+ strcpy(42, (char *)42); // no-crash
+}
+
+void strxfrm(int dst, char *src, size_t size);
+
+void test_strxfrm_proxy() {
+ strxfrm(42, (char *)42, 42); // no-crash
+}
diff --git a/clang/test/Analysis/buffer-overlap.c b/clang/test/Analysis/buffer-overlap.c
index 8414a76..defb17a 100644
--- a/clang/test/Analysis/buffer-overlap.c
+++ b/clang/test/Analysis/buffer-overlap.c
@@ -96,3 +96,10 @@ void test_snprintf6() {
char b[4] = {0};
snprintf(a, sizeof(a), "%s", b); // no-warning
}
+
+void* memcpy(void* dest, const void* src, size_t count);
+
+void test_memcpy_esoteric() {
+label:
+ memcpy((char *)&&label, (const char *)memcpy, 1);
+}
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index e901631..4c396d3 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -1270,3 +1270,40 @@ void real_on_scalar_from_real_with_type_promotion() {
// OGCG: %[[A_REAL_F32:.*]] = fpext half %[[A_REAL]] to float
// OGCG: %[[A_REAL_F16:.*]] = fptrunc float %[[A_REAL_F32]] to half
// OGCG: store half %[[A_REAL_F16]], ptr %[[B_ADDR]], align 2
+
+void real_on_scalar_from_imag_with_type_promotion() {
+ _Float16 _Complex a;
+ _Float16 b = __real__(__imag__ a);
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.f16>, !cir.ptr<!cir.complex<!cir.f16>>, ["a"]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr<!cir.f16>, ["b", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.f16>>, !cir.complex<!cir.f16>
+// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex<!cir.f16> -> !cir.f16
+// CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex<!cir.f16> -> !cir.f16
+// CIR: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float
+// CIR: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float
+// CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex<!cir.float>
+// CIR: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
+// CIR: %[[A_IMAG_F16:.*]] = cir.cast(floating, %[[A_IMAG_F32]] : !cir.float), !cir.f16
+// CIR: cir.store{{.*}} %[[A_IMAG_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
+
+// LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2
+// LLVM: %[[B_ADDR]] = alloca half, i64 1, align 2
+// LLVM: %[[TMP_A:.*]] = load { half, half }, ptr %[[A_ADDR]], align 2
+// LLVM: %[[A_REAL:.*]] = extractvalue { half, half } %[[TMP_A]], 0
+// LLVM: %[[A_IMAG:.*]] = extractvalue { half, half } %[[TMP_A]], 1
+// LLVM: %[[A_REAL_F32:.*]] = fpext half %[[A_REAL]] to float
+// LLVM: %[[A_IMAG_F32:.*]] = fpext half %[[A_IMAG]] to float
+// LLVM: %[[TMP_A_COMPLEX_F32:.*]] = insertvalue { float, float } {{.*}}, float %[[A_REAL_F32]], 0
+// LLVM: %[[A_COMPLEX_F32:.*]] = insertvalue { float, float } %[[TMP_A_COMPLEX_F32]], float %[[A_IMAG_F32]], 1
+// LLVM: %[[A_IMAG_F16:.*]] = fptrunc float %[[A_IMAG_F32]] to half
+// LLVM: store half %[[A_IMAG_F16]], ptr %[[B_ADDR]], align 2
+
+// OGCG: %[[A_ADDR:.*]] = alloca { half, half }, align 2
+// OGCG: %[[B_ADDR:.*]] = alloca half, align 2
+// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { half, half }, ptr %[[A_ADDR]], i32 0, i32 1
+// OGCG: %[[A_IMAG:.*]] = load half, ptr %[[A_IMAG_PTR]], align 2
+// OGCG: %[[A_IMAG_F32:.*]] = fpext half %[[A_IMAG]] to float
+// OGCG: %[[A_IMAG_F16:.*]] = fptrunc float %[[A_IMAG_F32]] to half
+// OGCG: store half %[[A_IMAG_F16]], ptr %[[B_ADDR]], align 2
diff --git a/clang/test/CIR/CodeGen/delete.cpp b/clang/test/CIR/CodeGen/delete.cpp
new file mode 100644
index 0000000..f21d203
--- /dev/null
+++ b/clang/test/CIR/CodeGen/delete.cpp
@@ -0,0 +1,88 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fclangir -mconstructor-aliases -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fclangir -mconstructor-aliases -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -mconstructor-aliases -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+typedef __typeof(sizeof(int)) size_t;
+
+struct SizedDelete {
+ void operator delete(void*, size_t);
+ int member;
+};
+void test_sized_delete(SizedDelete *x) {
+ delete x;
+}
+
+// SizedDelete::operator delete(void*, unsigned long)
+// CIR: cir.func private @_ZN11SizedDeletedlEPvm(!cir.ptr<!void>, !u64i)
+// LLVM: declare void @_ZN11SizedDeletedlEPvm(ptr, i64)
+
+// CIR: cir.func dso_local @_Z17test_sized_deleteP11SizedDelete
+// CIR: %[[X:.*]] = cir.load{{.*}} %{{.*}}
+// CIR: %[[X_CAST:.*]] = cir.cast(bitcast, %[[X]] : !cir.ptr<!rec_SizedDelete>), !cir.ptr<!void>
+// CIR: %[[OBJ_SIZE:.*]] = cir.const #cir.int<4> : !u64i
+// CIR: cir.call @_ZN11SizedDeletedlEPvm(%[[X_CAST]], %[[OBJ_SIZE]]) nothrow : (!cir.ptr<!void>, !u64i) -> ()
+
+// LLVM: define dso_local void @_Z17test_sized_deleteP11SizedDelete
+// LLVM: %[[X:.*]] = load ptr, ptr %{{.*}}
+// LLVM: call void @_ZN11SizedDeletedlEPvm(ptr %[[X]], i64 4)
+
+// OGCG: define dso_local void @_Z17test_sized_deleteP11SizedDelete
+// OGCG: %[[X:.*]] = load ptr, ptr %{{.*}}
+// OGCG: %[[ISNULL:.*]] = icmp eq ptr %[[X]], null
+// OGCG: br i1 %[[ISNULL]], label %{{.*}}, label %[[DELETE_NOTNULL:.*]]
+// OGCG: [[DELETE_NOTNULL]]:
+// OGCG: call void @_ZN11SizedDeletedlEPvm(ptr noundef %[[X]], i64 noundef 4)
+
+// This function is declared below the call in OGCG.
+// OGCG: declare void @_ZN11SizedDeletedlEPvm(ptr noundef, i64 noundef)
+
+struct Contents {
+ ~Contents() {}
+};
+struct Container {
+ Contents *contents;
+ ~Container();
+};
+Container::~Container() { delete contents; }
+
+// Contents::~Contents()
+// CIR: cir.func comdat linkonce_odr @_ZN8ContentsD2Ev
+// LLVM: define linkonce_odr void @_ZN8ContentsD2Ev
+
+// operator delete(void*, unsigned long)
+// CIR: cir.func private @_ZdlPvm(!cir.ptr<!void>, !u64i)
+// LLVM: declare void @_ZdlPvm(ptr, i64)
+
+// Container::~Container()
+// CIR: cir.func dso_local @_ZN9ContainerD2Ev
+// CIR: %[[THIS:.*]] = cir.load %{{.*}}
+// CIR: %[[CONTENTS_PTR_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "contents"} : !cir.ptr<!rec_Container> -> !cir.ptr<!cir.ptr<!rec_Contents>>
+// CIR: %[[CONTENTS_PTR:.*]] = cir.load{{.*}} %[[CONTENTS_PTR_ADDR]]
+// CIR: cir.call @_ZN8ContentsD2Ev(%[[CONTENTS_PTR]]) nothrow : (!cir.ptr<!rec_Contents>) -> ()
+// CIR: %[[CONTENTS_CAST:.*]] = cir.cast(bitcast, %[[CONTENTS_PTR]] : !cir.ptr<!rec_Contents>), !cir.ptr<!void>
+// CIR: %[[OBJ_SIZE:.*]] = cir.const #cir.int<1> : !u64i
+// CIR: cir.call @_ZdlPvm(%[[CONTENTS_CAST]], %[[OBJ_SIZE]]) nothrow : (!cir.ptr<!void>, !u64i) -> ()
+
+// LLVM: define dso_local void @_ZN9ContainerD2Ev
+// LLVM: %[[THIS:.*]] = load ptr, ptr %{{.*}}
+// LLVM: %[[CONTENTS_PTR_ADDR:.*]] = getelementptr %struct.Container, ptr %[[THIS]], i32 0, i32 0
+// LLVM: %[[CONTENTS_PTR:.*]] = load ptr, ptr %[[CONTENTS_PTR_ADDR]]
+// LLVM: call void @_ZN8ContentsD2Ev(ptr %[[CONTENTS_PTR]])
+// LLVM: call void @_ZdlPvm(ptr %[[CONTENTS_PTR]], i64 1)
+
+// OGCG: define dso_local void @_ZN9ContainerD2Ev
+// OGCG: %[[THIS:.*]] = load ptr, ptr %{{.*}}
+// OGCG: %[[CONTENTS:.*]] = getelementptr inbounds nuw %struct.Container, ptr %[[THIS]], i32 0, i32 0
+// OGCG: %[[CONTENTS_PTR:.*]] = load ptr, ptr %[[CONTENTS]]
+// OGCG: %[[ISNULL:.*]] = icmp eq ptr %[[CONTENTS_PTR]], null
+// OGCG: br i1 %[[ISNULL]], label %{{.*}}, label %[[DELETE_NOTNULL:.*]]
+// OGCG: [[DELETE_NOTNULL]]:
+// OGCG: call void @_ZN8ContentsD2Ev(ptr noundef nonnull align 1 dereferenceable(1) %[[CONTENTS_PTR]])
+// OGCG: call void @_ZdlPvm(ptr noundef %[[CONTENTS_PTR]], i64 noundef 1)
+
+// These functions are declared/defined below the calls in OGCG.
+// OGCG: define linkonce_odr void @_ZN8ContentsD2Ev
+// OGCG: declare void @_ZdlPvm(ptr noundef, i64 noundef)
diff --git a/clang/test/CIR/CodeGen/lang-c-cpp.cpp b/clang/test/CIR/CodeGen/lang-c-cpp.cpp
index e126932..8931783 100644
--- a/clang/test/CIR/CodeGen/lang-c-cpp.cpp
+++ b/clang/test/CIR/CodeGen/lang-c-cpp.cpp
@@ -3,8 +3,8 @@
// RUN: %clang_cc1 -x c -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.c.cir
// RUN: FileCheck --check-prefix=CIR-C --input-file=%t.c.cir %s
-// CIR-CPP: module attributes {{{.*}}cir.lang = #cir.lang<cxx>{{.*}}}
-// CIR-C: module attributes {{{.*}}cir.lang = #cir.lang<c>{{.*}}}
+// CIR-CPP: module{{.*}} attributes {{{.*}}cir.lang = #cir.lang<cxx>{{.*}}}
+// CIR-C: module{{.*}} attributes {{{.*}}cir.lang = #cir.lang<c>{{.*}}}
int main() {
return 0;
diff --git a/clang/test/CIR/CodeGen/module-filename.cpp b/clang/test/CIR/CodeGen/module-filename.cpp
new file mode 100644
index 0000000..05e2e92
--- /dev/null
+++ b/clang/test/CIR/CodeGen/module-filename.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// Normally, we try to avoid checking the filename of a test, but that's the
+// entire point of this test, so we use a wildcard for the path but check the
+// filename.
+// CIR: module @"{{.*}}module-filename.cpp"
+
+int main() {
+ return 0;
+}
diff --git a/clang/test/CIR/CodeGen/opt-info-attr.cpp b/clang/test/CIR/CodeGen/opt-info-attr.cpp
index 444286b..97071d7 100644
--- a/clang/test/CIR/CodeGen/opt-info-attr.cpp
+++ b/clang/test/CIR/CodeGen/opt-info-attr.cpp
@@ -13,10 +13,10 @@
void f() {}
-// CHECK-O0: module attributes
+// CHECK-O0: module{{.*}} attributes
// CHECK-O0-NOT: cir.opt_info
-// CHECK-O1: module attributes {{.+}}cir.opt_info = #cir.opt_info<level = 1, size = 0>{{.+}}
-// CHECK-O2: module attributes {{.+}}cir.opt_info = #cir.opt_info<level = 2, size = 0>{{.+}}
-// CHECK-O3: module attributes {{.+}}cir.opt_info = #cir.opt_info<level = 3, size = 0>{{.+}}
-// CHECK-Os: module attributes {{.+}}cir.opt_info = #cir.opt_info<level = 2, size = 1>{{.+}}
-// CHECK-Oz: module attributes {{.+}}cir.opt_info = #cir.opt_info<level = 2, size = 2>{{.+}}
+// CHECK-O1: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info<level = 1, size = 0>{{.+}}
+// CHECK-O2: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info<level = 2, size = 0>{{.+}}
+// CHECK-O3: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info<level = 3, size = 0>{{.+}}
+// CHECK-Os: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info<level = 2, size = 1>{{.+}}
+// CHECK-Oz: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info<level = 2, size = 2>{{.+}}
diff --git a/clang/test/CIR/CodeGen/vbase.cpp b/clang/test/CIR/CodeGen/vbase.cpp
index 9139651..4d57f8e 100644
--- a/clang/test/CIR/CodeGen/vbase.cpp
+++ b/clang/test/CIR/CodeGen/vbase.cpp
@@ -13,19 +13,29 @@ public:
class Derived : public virtual Base {};
-// This is just here to force the record types to be emitted.
void f() {
Derived d;
+ d.f();
+}
+
+class DerivedFinal final : public virtual Base {};
+
+void g() {
+ DerivedFinal df;
+ df.f();
}
// CIR: !rec_Base = !cir.record<class "Base" {!cir.vptr}>
// CIR: !rec_Derived = !cir.record<class "Derived" {!rec_Base}>
+// CIR: !rec_DerivedFinal = !cir.record<class "DerivedFinal" {!rec_Base}>
// LLVM: %class.Derived = type { %class.Base }
// LLVM: %class.Base = type { ptr }
+// LLVM: %class.DerivedFinal = type { %class.Base }
// OGCG: %class.Derived = type { %class.Base }
// OGCG: %class.Base = type { ptr }
+// OGCG: %class.DerivedFinal = type { %class.Base }
// Test the constructor handling for a class with a virtual base.
struct A {
@@ -47,6 +57,76 @@ void ppp() { B b; }
// OGCG: @_ZTV1B = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr inttoptr (i64 12 to ptr), ptr null, ptr @_ZTI1B] }, comdat, align 8
+// CIR: cir.func {{.*}}@_Z1fv() {
+// CIR: %[[D:.+]] = cir.alloca !rec_Derived, !cir.ptr<!rec_Derived>, ["d", init]
+// CIR: cir.call @_ZN7DerivedC1Ev(%[[D]]) nothrow : (!cir.ptr<!rec_Derived>) -> ()
+// CIR: %[[VPTR_PTR:.+]] = cir.vtable.get_vptr %[[D]] : !cir.ptr<!rec_Derived> -> !cir.ptr<!cir.vptr>
+// CIR: %[[VPTR:.+]] = cir.load {{.*}} %[[VPTR_PTR]] : !cir.ptr<!cir.vptr>, !cir.vptr
+// CIR: %[[VPTR_I8:.+]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr<!u8i>
+// CIR: %[[NEG32:.+]] = cir.const #cir.int<-32> : !s64i
+// CIR: %[[ADJ_VPTR_I8:.+]] = cir.ptr_stride(%[[VPTR_I8]] : !cir.ptr<!u8i>, %[[NEG32]] : !s64i), !cir.ptr<!u8i>
+// CIR: %[[OFFSET_PTR:.+]] = cir.cast(bitcast, %[[ADJ_VPTR_I8]] : !cir.ptr<!u8i>), !cir.ptr<!s64i>
+// CIR: %[[OFFSET:.+]] = cir.load {{.*}} %[[OFFSET_PTR]] : !cir.ptr<!s64i>, !s64i
+// CIR: %[[D_I8:.+]] = cir.cast(bitcast, %[[D]] : !cir.ptr<!rec_Derived>), !cir.ptr<!u8i>
+// CIR: %[[ADJ_THIS_I8:.+]] = cir.ptr_stride(%[[D_I8]] : !cir.ptr<!u8i>, %[[OFFSET]] : !s64i), !cir.ptr<!u8i>
+// CIR: %[[ADJ_THIS_D:.+]] = cir.cast(bitcast, %[[ADJ_THIS_I8]] : !cir.ptr<!u8i>), !cir.ptr<!rec_Derived>
+// CIR: %[[BASE_THIS:.+]] = cir.cast(bitcast, %[[ADJ_THIS_D]] : !cir.ptr<!rec_Derived>), !cir.ptr<!rec_Base>
+// CIR: %[[BASE_VPTR_PTR:.+]] = cir.vtable.get_vptr %[[BASE_THIS]] : !cir.ptr<!rec_Base> -> !cir.ptr<!cir.vptr>
+// CIR: %[[BASE_VPTR:.+]] = cir.load {{.*}} %[[BASE_VPTR_PTR]] : !cir.ptr<!cir.vptr>, !cir.vptr
+// CIR: %[[SLOT_PTR:.+]] = cir.vtable.get_virtual_fn_addr %[[BASE_VPTR]][0] : !cir.vptr -> !cir.ptr<!cir.ptr<!cir.func<(!cir.ptr<!rec_Base>)>>>
+// CIR: %[[FN:.+]] = cir.load {{.*}} %[[SLOT_PTR]] : !cir.ptr<!cir.ptr<!cir.func<(!cir.ptr<!rec_Base>)>>>, !cir.ptr<!cir.func<(!cir.ptr<!rec_Base>)>>
+// CIR: cir.call %[[FN]](%[[BASE_THIS]]) : (!cir.ptr<!cir.func<(!cir.ptr<!rec_Base>)>>, !cir.ptr<!rec_Base>) -> ()
+// CIR: cir.return
+
+// CIR: cir.func {{.*}}@_Z1gv() {
+// CIR: %[[DF:.+]] = cir.alloca !rec_DerivedFinal, !cir.ptr<!rec_DerivedFinal>, ["df", init]
+// CIR: cir.call @_ZN12DerivedFinalC1Ev(%[[DF]]) nothrow : (!cir.ptr<!rec_DerivedFinal>) -> ()
+// CIR: %[[BASE_THIS_2:.+]] = cir.base_class_addr %[[DF]] : !cir.ptr<!rec_DerivedFinal> nonnull [0] -> !cir.ptr<!rec_Base>
+// CIR: %[[BASE_VPTR_PTR_2:.+]] = cir.vtable.get_vptr %[[BASE_THIS_2]] : !cir.ptr<!rec_Base> -> !cir.ptr<!cir.vptr>
+// CIR: %[[BASE_VPTR_2:.+]] = cir.load {{.*}} %[[BASE_VPTR_PTR_2]] : !cir.ptr<!cir.vptr>, !cir.vptr
+// CIR: %[[SLOT_PTR_2:.+]] = cir.vtable.get_virtual_fn_addr %[[BASE_VPTR_2]][0] : !cir.vptr -> !cir.ptr<!cir.ptr<!cir.func<(!cir.ptr<!rec_Base>)>>>
+// CIR: %[[FN_2:.+]] = cir.load {{.*}} %[[SLOT_PTR_2]] : !cir.ptr<!cir.ptr<!cir.func<(!cir.ptr<!rec_Base>)>>>, !cir.ptr<!cir.func<(!cir.ptr<!rec_Base>)>>
+// CIR: cir.call %[[FN_2]](%[[BASE_THIS_2]]) : (!cir.ptr<!cir.func<(!cir.ptr<!rec_Base>)>>, !cir.ptr<!rec_Base>) -> ()
+// CIR: cir.return
+
+// LLVM: define {{.*}}void @_Z1fv()
+// LLVM: %[[D:.+]] = alloca {{.*}}
+// LLVM: call void @_ZN7DerivedC1Ev(ptr %[[D]])
+// LLVM: %[[VPTR_ADDR:.+]] = load ptr, ptr %[[D]]
+// LLVM: %[[NEG32_PTR:.+]] = getelementptr i8, ptr %[[VPTR_ADDR]], i64 -32
+// LLVM: %[[OFF:.+]] = load i64, ptr %[[NEG32_PTR]]
+// LLVM: %[[ADJ_THIS:.+]] = getelementptr i8, ptr %[[D]], i64 %[[OFF]]
+// LLVM: %[[VFN_TAB:.+]] = load ptr, ptr %[[ADJ_THIS]]
+// LLVM: %[[SLOT0:.+]] = getelementptr inbounds ptr, ptr %[[VFN_TAB]], i32 0
+// LLVM: %[[VFN:.+]] = load ptr, ptr %[[SLOT0]]
+// LLVM: call void %[[VFN]](ptr %[[ADJ_THIS]])
+// LLVM: ret void
+
+// LLVM: define {{.*}}void @_Z1gv()
+// LLVM: %[[DF:.+]] = alloca {{.*}}
+// LLVM: call void @_ZN12DerivedFinalC1Ev(ptr %[[DF]])
+// LLVM: %[[VPTR2:.+]] = load ptr, ptr %[[DF]]
+// LLVM: %[[SLOT0_2:.+]] = getelementptr inbounds ptr, ptr %[[VPTR2]], i32 0
+// LLVM: %[[VFN2:.+]] = load ptr, ptr %[[SLOT0_2]]
+// LLVM: call void %[[VFN2]](ptr %[[DF]])
+// LLVM: ret void
+
+// OGCG: define {{.*}}void @_Z1fv()
+// OGCG: %[[D:.+]] = alloca {{.*}}
+// OGCG: call void @_ZN7DerivedC1Ev(ptr {{.*}} %[[D]])
+// OGCG: %[[VTABLE:.+]] = load ptr, ptr %[[D]]
+// OGCG: %[[NEG32_PTR:.+]] = getelementptr i8, ptr %[[VTABLE]], i64 -32
+// OGCG: %[[OFF:.+]] = load i64, ptr %[[NEG32_PTR]]
+// OGCG: %[[ADJ_THIS:.+]] = getelementptr inbounds i8, ptr %[[D]], i64 %[[OFF]]
+// OGCG: call void @_ZN4Base1fEv(ptr {{.*}} %[[ADJ_THIS]])
+// OGCG: ret void
+
+// OGCG: define {{.*}}void @_Z1gv()
+// OGCG: %[[DF:.+]] = alloca {{.*}}
+// OGCG: call void @_ZN12DerivedFinalC1Ev(ptr {{.*}} %[[DF]])
+// OGCG: call void @_ZN4Base1fEv(ptr {{.*}} %[[DF]])
+// OGCG: ret void
+
// Constructor for B
// CIR: cir.func comdat linkonce_odr @_ZN1BC1Ev(%arg0: !cir.ptr<!rec_B>
// CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr<!rec_B>, !cir.ptr<!cir.ptr<!rec_B>>, ["this", init]
diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp
index 8b5379a..8bca48d 100644
--- a/clang/test/CIR/CodeGen/vector-ext.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext.cpp
@@ -1322,3 +1322,23 @@ void logical_not() {
// OGCG: %[[RESULT:.*]] = icmp eq <4 x i32> %[[TMP_A]], zeroinitializer
// OGCG: %[[RESULT_VI4:.*]] = sext <4 x i1> %[[RESULT]] to <4 x i32>
// OGCG: store <4 x i32> %[[RESULT_VI4]], ptr %[[B_ADDR]], align 16
+
+void unary_extension() {
+ vi4 a;
+ vi4 b = __extension__ a;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[TMP_A]], %[[B_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+
+// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[B_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// LLVM: store <4 x i32> %[[TMP_A]], ptr %[[B_ADDR]], align 16
+
+// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[B_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// OGCG: store <4 x i32> %[[TMP_A]], ptr %[[B_ADDR]], align 16
diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp
index d8fdeea..f242779 100644
--- a/clang/test/CIR/CodeGen/vector.cpp
+++ b/clang/test/CIR/CodeGen/vector.cpp
@@ -1390,3 +1390,23 @@ void logical_not_float() {
// OGCG: %[[RESULT:.*]] = fcmp oeq <4 x float> %[[TMP_A]], zeroinitializer
// OGCG: %[[RESULT_VI4:.*]] = sext <4 x i1> %[[RESULT]] to <4 x i32>
// OGCG: store <4 x i32> %[[RESULT_VI4]], ptr %[[B_ADDR]], align 16
+
+void unary_extension() {
+ vi4 a;
+ vi4 b = __extension__ a;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[TMP_A]], %[[B_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+
+// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[B_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// LLVM: store <4 x i32> %[[TMP_A]], ptr %[[B_ADDR]], align 16
+
+// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[B_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// OGCG: store <4 x i32> %[[TMP_A]], ptr %[[B_ADDR]], align 16
diff --git a/clang/test/CIR/IR/global-init.cir b/clang/test/CIR/IR/global-init.cir
new file mode 100644
index 0000000..727c067
--- /dev/null
+++ b/clang/test/CIR/IR/global-init.cir
@@ -0,0 +1,48 @@
+// RUN: cir-opt --verify-roundtrip %s -o - | FileCheck %s
+
+!u8i = !cir.int<u, 8>
+
+!rec_NeedsCtor = !cir.record<struct "NeedsCtor" padded {!u8i}>
+!rec_NeedsDtor = !cir.record<struct "NeedsDtor" padded {!u8i}>
+!rec_NeedsCtorDtor = !cir.record<struct "NeedsCtorDtor" padded {!u8i}>
+
+module attributes {cir.triple = "x86_64-unknown-linux-gnu"} {
+ cir.func private @_ZN9NeedsCtorC1Ev(!cir.ptr<!rec_NeedsCtor>)
+ cir.global external @needsCtor = ctor : !rec_NeedsCtor {
+ %0 = cir.get_global @needsCtor : !cir.ptr<!rec_NeedsCtor>
+ cir.call @_ZN9NeedsCtorC1Ev(%0) : (!cir.ptr<!rec_NeedsCtor>) -> ()
+ }
+ // CHECK: cir.global external @needsCtor = ctor : !rec_NeedsCtor {
+ // CHECK: %0 = cir.get_global @needsCtor : !cir.ptr<!rec_NeedsCtor>
+ // CHECK: cir.call @_ZN9NeedsCtorC1Ev(%0) : (!cir.ptr<!rec_NeedsCtor>) -> ()
+ // CHECK: }
+
+ cir.func private @_ZN9NeedsDtorD1Ev(!cir.ptr<!rec_NeedsDtor>)
+ cir.global external dso_local @needsDtor = #cir.zero : !rec_NeedsDtor dtor {
+ %0 = cir.get_global @needsDtor : !cir.ptr<!rec_NeedsDtor>
+ cir.call @_ZN9NeedsDtorD1Ev(%0) : (!cir.ptr<!rec_NeedsDtor>) -> ()
+ }
+ // CHECK: cir.global external dso_local @needsDtor = #cir.zero : !rec_NeedsDtor dtor {
+ // CHECK: %0 = cir.get_global @needsDtor : !cir.ptr<!rec_NeedsDtor>
+ // CHECK: cir.call @_ZN9NeedsDtorD1Ev(%0) : (!cir.ptr<!rec_NeedsDtor>) -> ()
+ // CHECK: }
+
+ cir.func private @_ZN13NeedsCtorDtorC1Ev(!cir.ptr<!rec_NeedsCtorDtor>)
+ cir.func private @_ZN13NeedsCtorDtorD1Ev(!cir.ptr<!rec_NeedsCtorDtor>)
+ cir.global external dso_local @needsCtorDtor = ctor : !rec_NeedsCtorDtor {
+ %0 = cir.get_global @needsCtorDtor : !cir.ptr<!rec_NeedsCtorDtor>
+ cir.call @_ZN13NeedsCtorDtorC1Ev(%0) : (!cir.ptr<!rec_NeedsCtorDtor>) -> ()
+ } dtor {
+ %0 = cir.get_global @needsCtorDtor : !cir.ptr<!rec_NeedsCtorDtor>
+ cir.call @_ZN13NeedsCtorDtorD1Ev(%0) : (!cir.ptr<!rec_NeedsCtorDtor>) -> ()
+ }
+ // CHECK: cir.func private @_ZN13NeedsCtorDtorC1Ev(!cir.ptr<!rec_NeedsCtorDtor>)
+ // CHECK: cir.func private @_ZN13NeedsCtorDtorD1Ev(!cir.ptr<!rec_NeedsCtorDtor>)
+ // CHECK: cir.global external dso_local @needsCtorDtor = ctor : !rec_NeedsCtorDtor {
+ // CHECK: %0 = cir.get_global @needsCtorDtor : !cir.ptr<!rec_NeedsCtorDtor>
+ // CHECK: cir.call @_ZN13NeedsCtorDtorC1Ev(%0) : (!cir.ptr<!rec_NeedsCtorDtor>) -> ()
+ // CHECK: } dtor {
+ // CHECK: %0 = cir.get_global @needsCtorDtor : !cir.ptr<!rec_NeedsCtorDtor>
+ // CHECK: cir.call @_ZN13NeedsCtorDtorD1Ev(%0) : (!cir.ptr<!rec_NeedsCtorDtor>) -> ()
+ // CHECK: }
+}
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 347cd9e..3018bb97 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -985,18 +985,21 @@ double test_mm256_cvtsd_f64(__m256d __a) {
// CHECK: extractelement <4 x double> %{{.*}}, i32 0
return _mm256_cvtsd_f64(__a);
}
+TEST_CONSTEXPR(_mm256_cvtsd_f64((__m256d){8.0, 7.0, 6.0, 5.0}) == 8.0);
int test_mm256_cvtsi256_si32(__m256i __a) {
// CHECK-LABEL: test_mm256_cvtsi256_si32
// CHECK: extractelement <8 x i32> %{{.*}}, i32 0
return _mm256_cvtsi256_si32(__a);
}
+TEST_CONSTEXPR(_mm256_cvtsi256_si32((__m256i)(__v8si){8, 7, 6, 5, 4, 3, 2, 1}) == 8);
float test_mm256_cvtss_f32(__m256 __a) {
// CHECK-LABEL: test_mm256_cvtss_f32
// CHECK: extractelement <8 x float> %{{.*}}, i32 0
return _mm256_cvtss_f32(__a);
}
+TEST_CONSTEXPR(_mm256_cvtss_f32((__m256){8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}) == 8.0f);
__m128i test_mm256_cvttpd_epi32(__m256d A) {
// CHECK-LABEL: test_mm256_cvttpd_epi32
diff --git a/clang/test/CodeGen/X86/bmi-builtins.c b/clang/test/CodeGen/X86/bmi-builtins.c
index ded40ca..d0ae0c7 100644
--- a/clang/test/CodeGen/X86/bmi-builtins.c
+++ b/clang/test/CodeGen/X86/bmi-builtins.c
@@ -1,7 +1,16 @@
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
-// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
-// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64,TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefixes=TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64,TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c++ -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefixes=TZCNT,TZCNT64
+
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,X64,TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,X64,TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c++ -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=TZCNT,TZCNT64
#include <immintrin.h>
@@ -48,20 +57,20 @@ unsigned int test_tzcnt_u32(unsigned int __X) {
#ifdef __x86_64__
unsigned long long test__tzcnt_u64(unsigned long long __X) {
-// TZCNT-LABEL: test__tzcnt_u64
-// TZCNT: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
+// TZCNT64-LABEL: test__tzcnt_u64
+// TZCNT64: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
return __tzcnt_u64(__X);
}
long long test_mm_tzcnt_64(unsigned long long __X) {
-// TZCNT-LABEL: test_mm_tzcnt_64
-// TZCNT: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
+// TZCNT64-LABEL: test_mm_tzcnt_64
+// TZCNT64: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
return _mm_tzcnt_64(__X);
}
unsigned long long test_tzcnt_u64(unsigned long long __X) {
-// TZCNT-LABEL: test_tzcnt_u64
-// TZCNT: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
+// TZCNT64-LABEL: test_tzcnt_u64
+// TZCNT64: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
return _tzcnt_u64(__X);
}
#endif
@@ -103,36 +112,36 @@ unsigned int test__blsr_u32(unsigned int __X) {
#ifdef __x86_64__
unsigned long long test__andn_u64(unsigned long __X, unsigned long __Y) {
-// CHECK-LABEL: test__andn_u64
-// CHECK: xor i64 %{{.*}}, -1
-// CHECK: and i64 %{{.*}}, %{{.*}}
+// X64-LABEL: test__andn_u64
+// X64: xor i64 %{{.*}}, -1
+// X64: and i64 %{{.*}}, %{{.*}}
return __andn_u64(__X, __Y);
}
unsigned long long test__bextr_u64(unsigned long __X, unsigned long __Y) {
-// CHECK-LABEL: test__bextr_u64
-// CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
+// X64-LABEL: test__bextr_u64
+// X64: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
return __bextr_u64(__X, __Y);
}
unsigned long long test__blsi_u64(unsigned long long __X) {
-// CHECK-LABEL: test__blsi_u64
-// CHECK: sub i64 0, %{{.*}}
-// CHECK: and i64 %{{.*}}, %{{.*}}
+// X64-LABEL: test__blsi_u64
+// X64: sub i64 0, %{{.*}}
+// X64: and i64 %{{.*}}, %{{.*}}
return __blsi_u64(__X);
}
unsigned long long test__blsmsk_u64(unsigned long long __X) {
-// CHECK-LABEL: test__blsmsk_u64
-// CHECK: sub i64 %{{.*}}, 1
-// CHECK: xor i64 %{{.*}}, %{{.*}}
+// X64-LABEL: test__blsmsk_u64
+// X64: sub i64 %{{.*}}, 1
+// X64: xor i64 %{{.*}}, %{{.*}}
return __blsmsk_u64(__X);
}
unsigned long long test__blsr_u64(unsigned long long __X) {
-// CHECK-LABEL: test__blsr_u64
-// CHECK: sub i64 %{{.*}}, 1
-// CHECK: and i64 %{{.*}}, %{{.*}}
+// X64-LABEL: test__blsr_u64
+// X64: sub i64 %{{.*}}, 1
+// X64: and i64 %{{.*}}, %{{.*}}
return __blsr_u64(__X);
}
#endif
@@ -186,49 +195,49 @@ unsigned int test_blsr_u32(unsigned int __X) {
#ifdef __x86_64__
unsigned long long test_andn_u64(unsigned long __X, unsigned long __Y) {
-// CHECK-LABEL: test_andn_u64
-// CHECK: xor i64 %{{.*}}, -1
-// CHECK: and i64 %{{.*}}, %{{.*}}
+// X64-LABEL: test_andn_u64
+// X64: xor i64 %{{.*}}, -1
+// X64: and i64 %{{.*}}, %{{.*}}
return _andn_u64(__X, __Y);
}
unsigned long long test_bextr_u64(unsigned long __X, unsigned int __Y,
unsigned int __Z) {
-// CHECK-LABEL: test_bextr_u64
-// CHECK: and i32 %{{.*}}, 255
-// CHECK: and i32 %{{.*}}, 255
-// CHECK: shl i32 %{{.*}}, 8
-// CHECK: or i32 %{{.*}}, %{{.*}}
-// CHECK: zext i32 %{{.*}} to i64
-// CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
+// X64-LABEL: test_bextr_u64
+// X64: and i32 %{{.*}}, 255
+// X64: and i32 %{{.*}}, 255
+// X64: shl i32 %{{.*}}, 8
+// X64: or i32 %{{.*}}, %{{.*}}
+// X64: zext i32 %{{.*}} to i64
+// X64: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
return _bextr_u64(__X, __Y, __Z);
}
unsigned long long test_bextr2_u64(unsigned long long __X,
unsigned long long __Y) {
-// CHECK-LABEL: test_bextr2_u64
-// CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
+// X64-LABEL: test_bextr2_u64
+// X64: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
return _bextr2_u64(__X, __Y);
}
unsigned long long test_blsi_u64(unsigned long long __X) {
-// CHECK-LABEL: test_blsi_u64
-// CHECK: sub i64 0, %{{.*}}
-// CHECK: and i64 %{{.*}}, %{{.*}}
+// X64-LABEL: test_blsi_u64
+// X64: sub i64 0, %{{.*}}
+// X64: and i64 %{{.*}}, %{{.*}}
return _blsi_u64(__X);
}
unsigned long long test_blsmsk_u64(unsigned long long __X) {
-// CHECK-LABEL: test_blsmsk_u64
-// CHECK: sub i64 %{{.*}}, 1
-// CHECK: xor i64 %{{.*}}, %{{.*}}
+// X64-LABEL: test_blsmsk_u64
+// X64: sub i64 %{{.*}}, 1
+// X64: xor i64 %{{.*}}, %{{.*}}
return _blsmsk_u64(__X);
}
unsigned long long test_blsr_u64(unsigned long long __X) {
-// CHECK-LABEL: test_blsr_u64
-// CHECK: sub i64 %{{.*}}, 1
-// CHECK: and i64 %{{.*}}, %{{.*}}
+// X64-LABEL: test_blsr_u64
+// X64: sub i64 %{{.*}}, 1
+// X64: and i64 %{{.*}}, %{{.*}}
return _blsr_u64(__X);
}
#endif
diff --git a/clang/test/CodeGen/X86/bmi2-builtins.c b/clang/test/CodeGen/X86/bmi2-builtins.c
index 48424f5..1b2cb90 100644
--- a/clang/test/CodeGen/X86/bmi2-builtins.c
+++ b/clang/test/CodeGen/X86/bmi2-builtins.c
@@ -3,6 +3,11 @@
// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s --check-prefix=B32
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefix=B32
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefix=B32
+
#include <immintrin.h>
diff --git a/clang/test/CodeGen/X86/tbm-builtins.c b/clang/test/CodeGen/X86/tbm-builtins.c
index d916627..89746bf 100644
--- a/clang/test/CodeGen/X86/tbm-builtins.c
+++ b/clang/test/CodeGen/X86/tbm-builtins.c
@@ -1,5 +1,12 @@
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK
+
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,X64
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,X64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
#include <x86intrin.h>
@@ -13,14 +20,14 @@ unsigned int test__bextri_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__bextri_u64(unsigned long long a) {
- // CHECK-LABEL: test__bextri_u64
- // CHECK: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 2)
+ // X64-LABEL: test__bextri_u64
+ // X64: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 2)
return __bextri_u64(a, 2);
}
unsigned long long test__bextri_u64_bigint(unsigned long long a) {
- // CHECK-LABEL: test__bextri_u64_bigint
- // CHECK: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 549755813887)
+ // X64-LABEL: test__bextri_u64_bigint
+ // X64: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 549755813887)
return __bextri_u64(a, 0x7fffffffffLL);
}
#endif
@@ -34,9 +41,9 @@ unsigned int test__blcfill_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__blcfill_u64(unsigned long long a) {
- // CHECK-LABEL: test__blcfill_u64
- // CHECK: [[TMP:%.*]] = add i64 %{{.*}}, 1
- // CHECK: %{{.*}} = and i64 %{{.*}}, [[TMP]]
+ // X64-LABEL: test__blcfill_u64
+ // X64: [[TMP:%.*]] = add i64 %{{.*}}, 1
+ // X64: %{{.*}} = and i64 %{{.*}}, [[TMP]]
return __blcfill_u64(a);
}
#endif
@@ -51,10 +58,10 @@ unsigned int test__blci_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__blci_u64(unsigned long long a) {
- // CHECK-LABEL: test__blci_u64
- // CHECK: [[TMP1:%.*]] = add i64 %{{.*}}, 1
- // CHECK: [[TMP2:%.*]] = xor i64 [[TMP1]], -1
- // CHECK: %{{.*}} = or i64 %{{.*}}, [[TMP2]]
+ // X64-LABEL: test__blci_u64
+ // X64: [[TMP1:%.*]] = add i64 %{{.*}}, 1
+ // X64: [[TMP2:%.*]] = xor i64 [[TMP1]], -1
+ // X64: %{{.*}} = or i64 %{{.*}}, [[TMP2]]
return __blci_u64(a);
}
#endif
@@ -69,10 +76,10 @@ unsigned int test__blcic_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__blcic_u64(unsigned long long a) {
- // CHECK-LABEL: test__blcic_u64
- // CHECK: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
- // CHECK: [[TMP2:%.*]] = add i64 %{{.*}}, 1
- // CHECK-NEXT: {{.*}} = and i64 [[TMP1]], [[TMP2]]
+ // X64-LABEL: test__blcic_u64
+ // X64: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
+ // X64: [[TMP2:%.*]] = add i64 %{{.*}}, 1
+ // X64-NEXT: {{.*}} = and i64 [[TMP1]], [[TMP2]]
return __blcic_u64(a);
}
#endif
@@ -86,9 +93,9 @@ unsigned int test__blcmsk_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__blcmsk_u64(unsigned long long a) {
- // CHECK-LABEL: test__blcmsk_u64
- // CHECK: [[TMP:%.*]] = add i64 %{{.*}}, 1
- // CHECK-NEXT: {{.*}} = xor i64 %{{.*}}, [[TMP]]
+ // X64-LABEL: test__blcmsk_u64
+ // X64: [[TMP:%.*]] = add i64 %{{.*}}, 1
+ // X64-NEXT: {{.*}} = xor i64 %{{.*}}, [[TMP]]
return __blcmsk_u64(a);
}
#endif
@@ -102,9 +109,9 @@ unsigned int test__blcs_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__blcs_u64(unsigned long long a) {
- // CHECK-LABEL: test__blcs_u64
- // CHECK: [[TMP:%.*]] = add i64 %{{.*}}, 1
- // CHECK-NEXT: {{.*}} = or i64 %{{.*}}, [[TMP]]
+ // X64-LABEL: test__blcs_u64
+ // X64: [[TMP:%.*]] = add i64 %{{.*}}, 1
+ // X64-NEXT: {{.*}} = or i64 %{{.*}}, [[TMP]]
return __blcs_u64(a);
}
#endif
@@ -118,9 +125,9 @@ unsigned int test__blsfill_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__blsfill_u64(unsigned long long a) {
- // CHECK-LABEL: test__blsfill_u64
- // CHECK: [[TMP:%.*]] = sub i64 %{{.*}}, 1
- // CHECK-NEXT: {{.*}} = or i64 %{{.*}}, [[TMP]]
+ // X64-LABEL: test__blsfill_u64
+ // X64: [[TMP:%.*]] = sub i64 %{{.*}}, 1
+ // X64-NEXT: {{.*}} = or i64 %{{.*}}, [[TMP]]
return __blsfill_u64(a);
}
#endif
@@ -135,10 +142,10 @@ unsigned int test__blsic_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__blsic_u64(unsigned long long a) {
- // CHECK-LABEL: test__blsic_u64
- // CHECK: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
- // CHECK: [[TMP2:%.*]] = sub i64 %{{.*}}, 1
- // CHECK-NEXT: {{.*}} = or i64 [[TMP1]], [[TMP2]]
+ // X64-LABEL: test__blsic_u64
+ // X64: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
+ // X64: [[TMP2:%.*]] = sub i64 %{{.*}}, 1
+ // X64-NEXT: {{.*}} = or i64 [[TMP1]], [[TMP2]]
return __blsic_u64(a);
}
#endif
@@ -153,10 +160,10 @@ unsigned int test__t1mskc_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__t1mskc_u64(unsigned long long a) {
- // CHECK-LABEL: test__t1mskc_u64
- // CHECK: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
- // CHECK: [[TMP2:%.*]] = add i64 %{{.*}}, 1
- // CHECK-NEXT: {{.*}} = or i64 [[TMP1]], [[TMP2]]
+ // X64-LABEL: test__t1mskc_u64
+ // X64: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
+ // X64: [[TMP2:%.*]] = add i64 %{{.*}}, 1
+ // X64-NEXT: {{.*}} = or i64 [[TMP1]], [[TMP2]]
return __t1mskc_u64(a);
}
#endif
@@ -171,10 +178,10 @@ unsigned int test__tzmsk_u32(unsigned int a) {
#ifdef __x86_64__
unsigned long long test__tzmsk_u64(unsigned long long a) {
- // CHECK-LABEL: test__tzmsk_u64
- // CHECK: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
- // CHECK: [[TMP2:%.*]] = sub i64 %{{.*}}, 1
- // CHECK-NEXT: {{.*}} = and i64 [[TMP1]], [[TMP2]]
+ // X64-LABEL: test__tzmsk_u64
+ // X64: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
+ // X64: [[TMP2:%.*]] = sub i64 %{{.*}}, 1
+ // X64-NEXT: {{.*}} = and i64 [[TMP1]], [[TMP2]]
return __tzmsk_u64(a);
}
#endif
diff --git a/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c b/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c
new file mode 100644
index 0000000..ef68c79
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c
@@ -0,0 +1,17 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn -emit-llvm -o - %s -debug-info-kind=limited | FileCheck %s
+
+// CHECK-LABEL: define dso_local void @test_locals(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] !dbg [[DBG6:![0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[IMG:%.*]] = alloca ptr, align 32, addrspace(5)
+// CHECK-NEXT: [[IMG_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IMG]] to ptr
+// CHECK-NEXT: #dbg_declare(ptr addrspace(5) [[IMG]], [[META11:![0-9]+]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), [[META14:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IMG_ASCAST]], align 32, !dbg [[DBG15:![0-9]+]]
+// CHECK-NEXT: ret void, !dbg [[DBG16:![0-9]+]]
+//
+void test_locals(void) {
+ __amdgpu_texture_t img;
+ (void)img;
+}
diff --git a/clang/test/CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp b/clang/test/CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp
new file mode 100644
index 0000000..0dbd517
--- /dev/null
+++ b/clang/test/CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp
@@ -0,0 +1,7 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn %s -emit-llvm -o - | FileCheck %s
+namespace std { class type_info; }
+auto &a = typeid(__amdgpu_texture_t);
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
diff --git a/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp
index 5920ced..137a49b 100644
--- a/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp
+++ b/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp
@@ -1,7 +1,10 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
// RUN: %clang_cc1 %s -x hip -fcuda-is-device -emit-llvm -O0 -o - \
-// RUN: -triple=amdgcn-amd-amdhsa | FileCheck %s
+// RUN: -triple=amdgcn-amd-amdhsa | FileCheck --check-prefix=GCN %s
+// RUN: %clang_cc1 %s -x hip -fcuda-is-device -emit-llvm -O0 -o - \
+// RUN: -triple=spirv64-amd-amdhsa | FileCheck --check-prefix=AMDGCNSPIRV %s
// CHECK-LABEL: @_Z29test_non_volatile_parameter32Pj(
// CHECK-NEXT: entry:
@@ -21,6 +24,43 @@
// CHECK-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i32 [[TMP7]], ptr [[RES_ASCAST]], align 4
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z29test_non_volatile_parameter32Pj(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GCN-NEXT: [[RES:%.*]] = alloca i32, align 4, addrspace(5)
+// GCN-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr
+// GCN-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RES]] to ptr
+// GCN-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]]
+// GCN-NEXT: store i32 [[TMP3]], ptr [[RES_ASCAST]], align 4
+// GCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP7]], ptr [[RES_ASCAST]], align 4
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z29test_non_volatile_parameter32Pj(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[PTR_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT: [[RES:%.*]] = alloca i32, align 4
+// AMDGCNSPIRV-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[PTR_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr [[RES]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[PTR:%.*]], ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) [[TMP0]], i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5:![0-9]+]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[RES_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4
+// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP7]], ptr addrspace(4) [[RES_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_non_volatile_parameter32(__UINT32_TYPE__ *ptr) {
__UINT32_TYPE__ res;
@@ -47,6 +87,43 @@ __attribute__((device)) void test_non_volatile_parameter32(__UINT32_TYPE__ *ptr)
// CHECK-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i64 [[TMP7]], ptr [[RES_ASCAST]], align 8
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z29test_non_volatile_parameter64Py(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GCN-NEXT: [[RES:%.*]] = alloca i64, align 8, addrspace(5)
+// GCN-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr
+// GCN-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RES]] to ptr
+// GCN-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP3]], ptr [[RES_ASCAST]], align 8
+// GCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
+// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP7]], ptr [[RES_ASCAST]], align 8
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z29test_non_volatile_parameter64Py(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[PTR_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT: [[RES:%.*]] = alloca i64, align 8
+// AMDGCNSPIRV-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[PTR_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr [[RES]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[PTR:%.*]], ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) [[TMP0]], i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[RES_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(4) [[TMP5]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP7]], ptr addrspace(4) [[RES_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_non_volatile_parameter64(__UINT64_TYPE__ *ptr) {
__UINT64_TYPE__ res;
@@ -73,6 +150,43 @@ __attribute__((device)) void test_non_volatile_parameter64(__UINT64_TYPE__ *ptr)
// CHECK-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i32 [[TMP7]], ptr [[RES_ASCAST]], align 4
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z25test_volatile_parameter32PVj(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GCN-NEXT: [[RES:%.*]] = alloca i32, align 4, addrspace(5)
+// GCN-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr
+// GCN-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RES]] to ptr
+// GCN-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP2:%.*]] = load volatile i32, ptr [[TMP1]], align 4
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw volatile uinc_wrap ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP3]], ptr [[RES_ASCAST]], align 4
+// GCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP6:%.*]] = load volatile i32, ptr [[TMP5]], align 4
+// GCN-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP7]], ptr [[RES_ASCAST]], align 4
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z25test_volatile_parameter32PVj(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[PTR_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT: [[RES:%.*]] = alloca i32, align 4
+// AMDGCNSPIRV-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[PTR_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr [[RES]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[PTR:%.*]], ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load volatile i32, ptr addrspace(4) [[TMP1]], align 4
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw volatile uinc_wrap ptr addrspace(4) [[TMP0]], i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[RES_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load volatile i32, ptr addrspace(4) [[TMP5]], align 4
+// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr addrspace(4) [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP7]], ptr addrspace(4) [[RES_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_volatile_parameter32(volatile __UINT32_TYPE__ *ptr) {
__UINT32_TYPE__ res;
@@ -99,6 +213,43 @@ __attribute__((device)) void test_volatile_parameter32(volatile __UINT32_TYPE__
// CHECK-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i64 [[TMP7]], ptr [[RES_ASCAST]], align 8
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z25test_volatile_parameter64PVy(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// GCN-NEXT: [[RES:%.*]] = alloca i64, align 8, addrspace(5)
+// GCN-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr
+// GCN-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RES]] to ptr
+// GCN-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP2:%.*]] = load volatile i64, ptr [[TMP1]], align 8
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw volatile uinc_wrap ptr [[TMP0]], i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP3]], ptr [[RES_ASCAST]], align 8
+// GCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8
+// GCN-NEXT: [[TMP6:%.*]] = load volatile i64, ptr [[TMP5]], align 8
+// GCN-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP7]], ptr [[RES_ASCAST]], align 8
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z25test_volatile_parameter64PVy(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[PTR_ADDR:%.*]] = alloca ptr addrspace(4), align 8
+// AMDGCNSPIRV-NEXT: [[RES:%.*]] = alloca i64, align 8
+// AMDGCNSPIRV-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[PTR_ADDR]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr [[RES]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[PTR:%.*]], ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load volatile i64, ptr addrspace(4) [[TMP1]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw volatile uinc_wrap ptr addrspace(4) [[TMP0]], i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[RES_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load volatile i64, ptr addrspace(4) [[TMP5]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr addrspace(4) [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP7]], ptr addrspace(4) [[RES_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_volatile_parameter64(volatile __UINT64_TYPE__ *ptr) {
__UINT64_TYPE__ res;
@@ -116,6 +267,25 @@ __attribute__((device)) void test_volatile_parameter64(volatile __UINT64_TYPE__
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z13test_shared32v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z13test_shared32v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_shared32() {
__attribute__((shared)) __UINT32_TYPE__ val;
@@ -134,6 +304,25 @@ __attribute__((device)) void test_shared32() {
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z13test_shared64v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z13test_shared64v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_shared64() {
__attribute__((shared)) __UINT64_TYPE__ val;
@@ -153,6 +342,25 @@ __attribute__((device)) __UINT32_TYPE__ global_val32;
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z13test_global32v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP1]], ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4
+// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z13test_global32v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_global32() {
global_val32 = __builtin_amdgcn_atomic_inc32(&global_val32, global_val32, __ATOMIC_SEQ_CST, "workgroup");
@@ -170,6 +378,25 @@ __attribute__((device)) __UINT64_TYPE__ global_val64;
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z13test_global64v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP1]], ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8
+// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z13test_global64v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_global64() {
global_val64 = __builtin_amdgcn_atomic_inc64(&global_val64, global_val64, __ATOMIC_SEQ_CST, "workgroup");
@@ -189,6 +416,29 @@ __attribute__((constant)) __UINT32_TYPE__ cval32;
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i32 [[TMP3]], ptr [[LOCAL_VAL_ASCAST]], align 4
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z15test_constant32v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[LOCAL_VAL:%.*]] = alloca i32, align 4, addrspace(5)
+// GCN-NEXT: [[LOCAL_VAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LOCAL_VAL]] to ptr
+// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), align 4
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP1]], ptr [[LOCAL_VAL_ASCAST]], align 4
+// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), align 4
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP3]], ptr [[LOCAL_VAL_ASCAST]], align 4
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z15test_constant32v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[LOCAL_VAL:%.*]] = alloca i32, align 4
+// AMDGCNSPIRV-NEXT: [[LOCAL_VAL_ASCAST:%.*]] = addrspacecast ptr [[LOCAL_VAL]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval32 to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval32 to ptr addrspace(4)), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[LOCAL_VAL_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval32 to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval32 to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[LOCAL_VAL_ASCAST]], align 4
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_constant32() {
__UINT32_TYPE__ local_val;
@@ -210,6 +460,29 @@ __attribute__((constant)) __UINT64_TYPE__ cval64;
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i64 [[TMP3]], ptr [[LOCAL_VAL_ASCAST]], align 8
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z15test_constant64v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[LOCAL_VAL:%.*]] = alloca i64, align 8, addrspace(5)
+// GCN-NEXT: [[LOCAL_VAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LOCAL_VAL]] to ptr
+// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), align 8
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP1]], ptr [[LOCAL_VAL_ASCAST]], align 8
+// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), align 8
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP3]], ptr [[LOCAL_VAL_ASCAST]], align 8
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z15test_constant64v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[LOCAL_VAL:%.*]] = alloca i64, align 8
+// AMDGCNSPIRV-NEXT: [[LOCAL_VAL_ASCAST:%.*]] = addrspacecast ptr [[LOCAL_VAL]] to ptr addrspace(4)
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval64 to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval64 to ptr addrspace(4)), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) [[LOCAL_VAL_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval64 to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval64 to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[LOCAL_VAL_ASCAST]], align 8
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_constant64() {
__UINT64_TYPE__ local_val;
@@ -240,6 +513,49 @@ __attribute__((device)) void test_constant64() {
// CHECK-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP10]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i32 [[TMP11]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z12test_order32v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP0]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP2]] syncscope("workgroup") acquire, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP5:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP4]] syncscope("workgroup") acquire, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP5]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP6:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP6]] syncscope("workgroup") release, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP8:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP9:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP8]] syncscope("workgroup") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP9]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP10:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP10]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP11]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z12test_order32v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP0]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") acquire, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP4]] syncscope("workgroup") acquire, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP5]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP6]] syncscope("workgroup") release, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP7]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP8]] syncscope("workgroup") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP9]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP10]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP11]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_order32() {
__attribute__((shared)) __UINT32_TYPE__ val;
@@ -278,6 +594,49 @@ __attribute__((device)) void test_order32() {
// CHECK-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP10]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i64 [[TMP11]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z12test_order64v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP0]] syncscope("workgroup") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP2]] syncscope("workgroup") acquire, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP4:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP5:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP4]] syncscope("workgroup") acquire, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP5]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP6:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP6]] syncscope("workgroup") release, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP8:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP9:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP8]] syncscope("workgroup") acq_rel, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP9]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP10:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP10]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP11]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z12test_order64v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP0]] syncscope("workgroup") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") acquire, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP4]] syncscope("workgroup") acquire, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP5]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP6]] syncscope("workgroup") release, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP7]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP8]] syncscope("workgroup") acq_rel, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP9]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP10]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP11]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_order64() {
__attribute__((shared)) __UINT64_TYPE__ val;
@@ -310,6 +669,37 @@ __attribute__((device)) void test_order64() {
// CHECK-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP6]] syncscope("wavefront") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i32 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z12test_scope32v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP0]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP5:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP4]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP5]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP6:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
+// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP6]] syncscope("wavefront") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i32 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z12test_scope32v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), i32 [[TMP0]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), i32 [[TMP4]] syncscope("device") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP5]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), i32 [[TMP6]] syncscope("subgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i32 [[TMP7]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_scope32() {
__attribute__((shared)) __UINT32_TYPE__ val;
@@ -338,6 +728,37 @@ __attribute__((device)) void test_scope32() {
// CHECK-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP6]] syncscope("wavefront") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
// CHECK-NEXT: store i64 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
// CHECK-NEXT: ret void
+// GCN-LABEL: @_Z12test_scope64v(
+// GCN-NEXT: entry:
+// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP0]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP4:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP5:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP4]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP5]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP6:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
+// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP6]] syncscope("wavefront") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]]
+// GCN-NEXT: store i64 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: @_Z12test_scope64v(
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), i64 [[TMP0]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), i64 [[TMP4]] syncscope("device") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP5]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), i64 [[TMP6]] syncscope("subgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]]
+// AMDGCNSPIRV-NEXT: store i64 [[TMP7]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8
+// AMDGCNSPIRV-NEXT: ret void
//
__attribute__((device)) void test_scope64() {
__attribute__((shared)) __UINT64_TYPE__ val;
diff --git a/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp
index 1e977dd..dd1ca45 100644
--- a/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp
+++ b/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp
@@ -1,7 +1,10 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
// REQUIRES: amdgpu-registered-target
+// REQUIRES: spirv-registered-target
// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \
-// RUN: -triple=amdgcn-amd-amdhsa | FileCheck %s
+// RUN: -triple=amdgcn-amd-amdhsa | FileCheck --check-prefix=GCN %s
+// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \
+// RUN: -triple=spirv64-amd-amdhsa | FileCheck --check-prefix=AMDGCNSPIRV %s
// CHECK-LABEL: define dso_local void @_Z25test_memory_fence_successv(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
@@ -12,6 +15,25 @@
// CHECK-NEXT: fence syncscope("agent") acq_rel
// CHECK-NEXT: fence syncscope("workgroup") release
// CHECK-NEXT: ret void
+// GCN-LABEL: define dso_local void @_Z25test_memory_fence_successv(
+// GCN-SAME: ) #[[ATTR0:[0-9]+]] {
+// GCN-NEXT: entry:
+// GCN-NEXT: fence syncscope("workgroup") seq_cst
+// GCN-NEXT: fence syncscope("agent") acquire
+// GCN-NEXT: fence seq_cst
+// GCN-NEXT: fence syncscope("agent") acq_rel
+// GCN-NEXT: fence syncscope("workgroup") release
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @_Z25test_memory_fence_successv(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] {
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst
+// AMDGCNSPIRV-NEXT: fence syncscope("device") acquire
+// AMDGCNSPIRV-NEXT: fence seq_cst
+// AMDGCNSPIRV-NEXT: fence syncscope("device") acq_rel
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") release
+// AMDGCNSPIRV-NEXT: ret void
//
void test_memory_fence_success() {
@@ -35,6 +57,25 @@ void test_memory_fence_success() {
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]]
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
// CHECK-NEXT: ret void
+// GCN-LABEL: define dso_local void @_Z10test_localv(
+// GCN-SAME: ) #[[ATTR0]] {
+// GCN-NEXT: entry:
+// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]]
+// GCN-NEXT: fence syncscope("agent") acquire, !mmra [[META3]]
+// GCN-NEXT: fence seq_cst, !mmra [[META3]]
+// GCN-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]]
+// GCN-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @_Z10test_localv(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0]] {
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]]
+// AMDGCNSPIRV-NEXT: fence syncscope("device") acquire, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: fence seq_cst, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: fence syncscope("device") acq_rel, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: ret void
//
void test_local() {
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local");
@@ -58,6 +99,25 @@ void test_local() {
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META4]]
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META4]]
// CHECK-NEXT: ret void
+// GCN-LABEL: define dso_local void @_Z11test_globalv(
+// GCN-SAME: ) #[[ATTR0]] {
+// GCN-NEXT: entry:
+// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]]
+// GCN-NEXT: fence syncscope("agent") acquire, !mmra [[META4]]
+// GCN-NEXT: fence seq_cst, !mmra [[META4]]
+// GCN-NEXT: fence syncscope("agent") acq_rel, !mmra [[META4]]
+// GCN-NEXT: fence syncscope("workgroup") release, !mmra [[META4]]
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @_Z11test_globalv(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0]] {
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]]
+// AMDGCNSPIRV-NEXT: fence syncscope("device") acquire, !mmra [[META4]]
+// AMDGCNSPIRV-NEXT: fence seq_cst, !mmra [[META4]]
+// AMDGCNSPIRV-NEXT: fence syncscope("device") acq_rel, !mmra [[META4]]
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") release, !mmra [[META4]]
+// AMDGCNSPIRV-NEXT: ret void
//
void test_global() {
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "global");
@@ -80,6 +140,25 @@ void test_global() {
// CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]]
// CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
// CHECK-NEXT: ret void
+// GCN-LABEL: define dso_local void @_Z10test_imagev(
+// GCN-SAME: ) #[[ATTR0]] {
+// GCN-NEXT: entry:
+// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3]]
+// GCN-NEXT: fence syncscope("agent") acquire, !mmra [[META3]]
+// GCN-NEXT: fence seq_cst, !mmra [[META3]]
+// GCN-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]]
+// GCN-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @_Z10test_imagev(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0]] {
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: fence syncscope("device") acquire, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: fence seq_cst, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: fence syncscope("device") acq_rel, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") release, !mmra [[META3]]
+// AMDGCNSPIRV-NEXT: ret void
//
void test_image() {
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local");
@@ -99,13 +178,33 @@ void test_image() {
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]]
// CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]]
// CHECK-NEXT: ret void
+// GCN-LABEL: define dso_local void @_Z10test_mixedv(
+// GCN-SAME: ) #[[ATTR0]] {
+// GCN-NEXT: entry:
+// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]]
+// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]]
+// GCN-NEXT: ret void
+//
+// AMDGCNSPIRV-LABEL: define spir_func void @_Z10test_mixedv(
+// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0]] {
+// AMDGCNSPIRV-NEXT: entry:
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]]
+// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]]
+// AMDGCNSPIRV-NEXT: ret void
//
void test_mixed() {
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local", "global");
__builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local", "local", "global", "local", "local");
}
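// The trailing string arguments name the address spaces the fence
// synchronizes; they are lowered to !mmra metadata (the META3/META4 tuples
// checked below), with duplicate entries collapsed into a single tuple.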
-//.
// CHECK: [[META3]] = !{!"amdgpu-synchronize-as", !"local"}
// CHECK: [[META4]] = !{!"amdgpu-synchronize-as", !"global"}
// CHECK: [[META5]] = !{[[META4]], [[META3]]}
//.
+// GCN: [[META3]] = !{!"amdgpu-synchronize-as", !"local"}
+// GCN: [[META4]] = !{!"amdgpu-synchronize-as", !"global"}
+// GCN: [[META5]] = !{[[META4]], [[META3]]}
+//.
+// AMDGCNSPIRV: [[META3]] = !{!"amdgpu-synchronize-as", !"local"}
+// AMDGCNSPIRV: [[META4]] = !{!"amdgpu-synchronize-as", !"global"}
+// AMDGCNSPIRV: [[META5]] = !{[[META4]], [[META3]]}
+//.
diff --git a/clang/test/CodeGenCXX/gh56652.cpp b/clang/test/CodeGenCXX/gh56652.cpp
new file mode 100644
index 0000000..06a496e
--- /dev/null
+++ b/clang/test/CodeGenCXX/gh56652.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-elf-gnu %s -emit-llvm -o - | FileCheck %s
+
+namespace GH56652{
+
+struct foo {};
+
+template <typename T> struct bar {
+ using type = T;
+
+ template <foo> inline static constexpr auto b = true;
+};
+
+template <typename T>
+concept C = requires(T a) { T::template b<foo{}>; };
+
+template <typename T> auto fn(T) {
+ if constexpr (!C<T>)
+ return foo{};
+ else
+ return T{};
+}
+
+auto a = decltype(fn(bar<int>{})){};
+
+}
+
+namespace GH116319 {
+
+template <int = 0> struct a {
+template <class> static constexpr auto b = 2;
+template <class> static void c() noexcept(noexcept(b<int>)) {}
+};
+
+void test() { a<>::c<int>(); }
+
+
+}
+
+// CHECK: %"struct.GH56652::bar" = type { i8 }
+// CHECK: $_ZN8GH1163191aILi0EE1cIiEEvv = comdat any
+// CHECK: @_ZN7GH566521aE = global %"struct.GH56652::bar" undef
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
index 19ab656..7cd3f14 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -1,13 +1,13 @@
// REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1101 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1102 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1103 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1152 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1153 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1101 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1102 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1103 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1152 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1153 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s
typedef unsigned int uint;
typedef unsigned long ulong;
@@ -50,7 +50,8 @@ void test_s_wait_event_export_ready() {
}
// CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// GCN: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// AMDGCNSPIRV: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("device") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
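+// Note: the amdgcn "agent" and "wavefront" syncscopes correspond to the
+// SPIR-V "device" and "subgroup" scopes, which is why the AMDGCNSPIRV
+// checks differ from the GCN ones only in the scope strings.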
#if !defined(__SPIRV__)
void test_global_add_f32(float *rtn, global float *addr, float x) {
#else
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 5f202ba..6bb20bf 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -1,9 +1,9 @@
// REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s
+// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
@@ -252,9 +252,11 @@ void test_update_dpp_const_int(global int* out, int arg1)
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
-// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
-// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
#if !defined(__SPIRV__)
@@ -293,9 +295,11 @@ void test_ds_faddf(local float *out, float src) {
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
-// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
@@ -334,9 +338,11 @@ void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
-// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 039d032..ab0b0b9 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1231,7 +1231,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
// CHECK: atomicrmw udec_wrap ptr addrspace(3) %lptr, i32 %val syncscope("workgroup") seq_cst, align 4
res = __builtin_amdgcn_atomic_dec32(lptr, val, __ATOMIC_SEQ_CST, "workgroup");
- // CHECK: atomicrmw uinc_wrap ptr addrspace(1) %gptr, i32 %val syncscope("agent") seq_cst, align 4
+ // CHECK-AMDGCN: atomicrmw uinc_wrap ptr addrspace(1) %gptr, i32 %val syncscope("agent") seq_cst, align 4
+ // CHECK-SPIRV: atomicrmw uinc_wrap ptr addrspace(1) %gptr, i32 %val syncscope("device") seq_cst, align 4
res = __builtin_amdgcn_atomic_inc32(gptr, val, __ATOMIC_SEQ_CST, "agent");
// CHECK: atomicrmw udec_wrap ptr addrspace(1) %gptr, i32 %val seq_cst, align 4
diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
index 7606713..af0f124 100644
--- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp
+++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
@@ -18,6 +18,14 @@
// RUN: --target=x86_64-linux-gnu 2>&1 \
// RUN: | FileCheck libcxx.cpp
+// Check that -nostdlib suppresses reporting of a library-provided module
+// manifest, even when libc++.modules.json is present.
+// RUN: %clang -print-library-module-manifest-path \
+// RUN: -nostdlib \
+// RUN: -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN: --target=x86_64-linux-gnu 2>&1 \
+// RUN: | FileCheck libcxx-no-module-json.cpp
+
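+// (libcxx-no-module-json.cpp is the check file this test uses for the
+// case where no manifest should be reported, so the -nostdlib run is
+// expected to produce the same "not found" output.)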
// for macos there is a different directory structure
// where the library and libc++.modules.json file are in lib
// directly but headers are in clang/ver directory which
diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp
index ced5bca..8eb9ea0 100644
--- a/clang/test/Lexer/cxx-features.cpp
+++ b/clang/test/Lexer/cxx-features.cpp
@@ -148,7 +148,7 @@
// init_captures checked below
-#if check(modules, 0, 0, 0, 0, 0, 0, 0)
+#if check(modules, 0, 0, 0, 0, 1, 1, 1)
// FIXME: 201907 in C++20
#error "wrong value for __cpp_modules"
#endif
diff --git a/clang/test/OpenMP/for_reduction_codegen.cpp b/clang/test/OpenMP/for_reduction_codegen.cpp
index 83632db..cb4bcc9 100644
--- a/clang/test/OpenMP/for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen.cpp
@@ -27,7 +27,6 @@ struct S {
~S() {}
};
-
template <typename T, int length>
T tmain() {
T t;
@@ -60,6 +59,15 @@ T tmain() {
}
extern S<float> **foo();
+int g_arr[10];
+
+void reductionArrayElement() {
+#pragma omp parallel
+#pragma omp for reduction(+:g_arr[1])
+ for (int i = 0; i < 10; i++) {
+ g_arr[1] += i;
+ }
+}
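+// With reduction(+ : g_arr[1]), each thread accumulates into a private
+// copy of the array element and the partial sums are combined under
+// __kmpc_reduce; the CHECK1 lines for the outlined function below check
+// exactly this lowering.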
int main() {
#ifdef LAMBDA
@@ -164,6 +172,7 @@ int main() {
#pragma omp for reduction(& : var3)
for (int i = 0; i < 10; ++i)
;
+ reductionArrayElement();
return tmain<int, 42>();
#endif
}
@@ -535,6 +544,26 @@ int main() {
//.
// CHECK4: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8
//.
+
+// CHECK1-LABEL: define {{.*}}reductionArrayElement{{.*}}.omp_outlined{{.*}}
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1: [[G_ARR:%.*]] = alloca i32, align 4
+// CHECK1: [[TMP0:%.*]] = sdiv exact i64 sub (i64 ptrtoint (ptr @g_arr to i64){{.*}}
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[G_ARR:%.*]], i64 [[TMP0]]
+// CHECK1: omp.inner.for.body:
+// CHECK1: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP1]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]],{{.+}}
+// CHECK1-NEXT: store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void {{.*}}__kmpc_for_static_fini{{.+}}
+// CHECK1: {{.*}}call i32 {{.*}}__kmpc_reduce{{.+}}
+// CHECK1: omp.reduction.default:
+// CHECK1-NEXT: call void @__kmpc_barrier{{.+}}
+// CHECK1-NEXT: ret void
+//
+
// CHECK1-LABEL: define {{[^@]+}}@main
// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT: entry:
@@ -614,6 +643,7 @@ int main() {
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined.11, ptr [[TMP7]])
// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[VAR3]], align 8
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined.12, ptr [[TMP8]])
+// CHECK1-NEXT: call void {{.*}}reductionArrayElement{{.*}}
// CHECK1-NEXT: [[CALL10:%.*]] = call noundef i32 @_Z5tmainIiLi42EET_v()
// CHECK1-NEXT: store i32 [[CALL10]], ptr [[RETVAL]], align 4
// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8
diff --git a/clang/test/OpenMP/fuse_ast_print.cpp b/clang/test/OpenMP/fuse_ast_print.cpp
new file mode 100644
index 0000000..283f588
--- /dev/null
+++ b/clang/test/OpenMP/fuse_ast_print.cpp
@@ -0,0 +1,397 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check AST and unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -ast-dump %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -include-pch %t -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+#ifndef HEADER
+#define HEADER
+
+// placeholder for loop body code
+extern "C" void body(...);
+
+// PRINT-LABEL: void foo1(
+// DUMP-LABEL: FunctionDecl {{.*}} foo1
+void foo1() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+
+}
+
+// PRINT-LABEL: void foo2(
+// DUMP-LABEL: FunctionDecl {{.*}} foo2
+void foo2() {
+ // PRINT: #pragma omp unroll partial(4)
+ // DUMP: OMPUnrollDirective
+ // DUMP-NEXT: OMPPartialClause
+ // DUMP-NEXT: ConstantExpr
+ // DUMP-NEXT: value: Int 4
+ // DUMP-NEXT: IntegerLiteral {{.*}} 4
+ #pragma omp unroll partial(4)
+ // PRINT: #pragma omp fuse
+ // DUMP-NEXT: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+
+}
+
+// PRINT-LABEL: void foo3(
+// DUMP-LABEL: FunctionTemplateDecl {{.*}} foo3
+template<int Factor1, int Factor2>
+void foo3() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: #pragma omp unroll partial(Factor1)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(Factor1)
+ // PRINT: for (int i = 0; i < 12; i += 1)
+ // DUMP: ForStmt
+ for (int i = 0; i < 12; i += 1)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: #pragma omp unroll partial(Factor2)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(Factor2)
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo3() {
+ foo3<4,2>();
+}
+
+// PRINT-LABEL: void foo4(
+// DUMP-LABEL: FunctionTemplateDecl {{.*}} foo4
+template<typename T, T Step>
+void foo4(int start, int end) {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (T i = start; i < end; i += Step)
+ // DUMP: ForStmt
+ for (T i = start; i < end; i += Step)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+
+ // PRINT: for (T j = end; j > start; j -= Step)
+ // DUMP: ForStmt
+ for (T j = end; j > start; j -= Step) {
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo4() {
+ foo4<int, 4>(0, 64);
+}
+
+
+
+// PRINT-LABEL: void foo5(
+// DUMP-LABEL: FunctionDecl {{.*}} foo5
+void foo5() {
+ double arr[128], arr2[128];
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT-NEXT: for (auto &&a : arr)
+ // DUMP-NEXT: CXXForRangeStmt
+ for (auto &&a: arr)
+ // PRINT: body(a)
+ // DUMP: CallExpr
+ body(a);
+ // PRINT: for (double v = 42; auto &&b : arr)
+ // DUMP: CXXForRangeStmt
+ for (double v = 42; auto &&b: arr)
+ // PRINT: body(b, v);
+ // DUMP: CallExpr
+ body(b, v);
+ // PRINT: for (auto &&c : arr2)
+ // DUMP: CXXForRangeStmt
+ for (auto &&c: arr2)
+ // PRINT: body(c)
+ // DUMP: CallExpr
+ body(c);
+
+ }
+
+}
+
+// PRINT-LABEL: void foo6(
+// DUMP-LABEL: FunctionDecl {{.*}} foo6
+void foo6() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i <= 10; ++i)
+ // DUMP: ForStmt
+ for (int i = 0; i <= 10; ++i)
+ body(i);
+ // PRINT: for (int j = 0; j < 100; ++j)
+ // DUMP: ForStmt
+ for(int j = 0; j < 100; ++j)
+ body(j);
+ }
+ // PRINT: #pragma omp unroll partial(4)
+ // DUMP: OMPUnrollDirective
+ #pragma omp unroll partial(4)
+ // PRINT: for (int k = 0; k < 250; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k < 250; ++k)
+ body(k);
+ }
+}
+
+// PRINT-LABEL: void foo7(
+// DUMP-LABEL: FunctionDecl {{.*}} foo7
+void foo7() {
+ // PRINT: #pragma omp fuse
+ // DUMP: OMPFuseDirective
+ #pragma omp fuse
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ }
+ }
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+ }
+ }
+ }
+ }
+
+}
+
+// PRINT-LABEL: void foo8(
+// DUMP-LABEL: FunctionDecl {{.*}} foo8
+void foo8() {
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+
+ }
+
+}
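+// looprange(2,2) applies the fusion to the two consecutive loops starting
+// at position 2 of the loop sequence, i.e. the j and k loops here; the i
+// loop is left unfused.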
+
+// PRINT-LABEL: void foo9(
+// DUMP-LABEL: FunctionTemplateDecl {{.*}} foo9
+// DUMP-LABEL: NonTypeTemplateParmDecl {{.*}} F
+// DUMP-LABEL: NonTypeTemplateParmDecl {{.*}} C
+template<int F, int C>
+void foo9() {
+ // PRINT: #pragma omp fuse looprange(F,C)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(F,C)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+
+ }
+}
+
+// Also test instantiating the template.
+void tfoo9() {
+ foo9<1, 2>();
+}
+
+// PRINT-LABEL: void foo10(
+// DUMP-LABEL: FunctionDecl {{.*}} foo10
+void foo10() {
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ // PRINT: {
+ // DUMP: CompoundStmt
+ {
+ // PRINT: for (int i = 0; i < 10; i += 2)
+ // DUMP: ForStmt
+ for (int i = 0; i < 10; i += 2)
+ // PRINT: body(i)
+ // DUMP: CallExpr
+ body(i);
+ // PRINT: for (int ii = 0; ii < 10; ii += 2)
+ // DUMP: ForStmt
+ for (int ii = 0; ii < 10; ii += 2)
+ // PRINT: body(ii)
+ // DUMP: CallExpr
+ body(ii);
+ // PRINT: #pragma omp fuse looprange(2,2)
+ // DUMP: OMPFuseDirective
+ // DUMP: OMPLooprangeClause
+ #pragma omp fuse looprange(2,2)
+ {
+ // PRINT: for (int j = 10; j > 0; --j)
+ // DUMP: ForStmt
+ for (int j = 10; j > 0; --j)
+ // PRINT: body(j)
+ // DUMP: CallExpr
+ body(j);
+ // PRINT: for (int jj = 10; jj > 0; --jj)
+ // DUMP: ForStmt
+ for (int jj = 10; jj > 0; --jj)
+ // PRINT: body(jj)
+ // DUMP: CallExpr
+ body(jj);
+ // PRINT: for (int k = 0; k <= 10; ++k)
+ // DUMP: ForStmt
+ for (int k = 0; k <= 10; ++k)
+ // PRINT: body(k)
+ // DUMP: CallExpr
+ body(k);
+ // PRINT: for (int kk = 0; kk <= 10; ++kk)
+ // DUMP: ForStmt
+ for (int kk = 0; kk <= 10; ++kk)
+ // PRINT: body(kk)
+ // DUMP: CallExpr
+ body(kk);
+ }
+ }
+
+}
+
+#endif
diff --git a/clang/test/OpenMP/fuse_codegen.cpp b/clang/test/OpenMP/fuse_codegen.cpp
new file mode 100644
index 0000000..742c280
--- /dev/null
+++ b/clang/test/OpenMP/fuse_codegen.cpp
@@ -0,0 +1,2328 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5
+// expected-no-diagnostics
+
+// Check code generation
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+
+#ifndef HEADER
+#define HEADER
+
+// placeholder for loop body code.
+extern "C" void body(...) {}
+
+extern "C" void foo1(int start1, int end1, int step1, int start2, int end2, int step2) {
+ int i,j;
+ #pragma omp fuse
+ {
+ for(i = start1; i < end1; i += step1) body(i);
+ for(j = start2; j < end2; j += step2) body(j);
+ }
+
+}
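+// The generated IR (see the CHECK1 lines for foo1 below) materializes the
+// trip counts of the two loops in .omp.ni0 and .omp.ni1, takes their
+// maximum as .omp.fuse.max, and then iterates .omp.fuse.index over
+// [0, .omp.fuse.max), guarding each original body so that it only executes
+// while the index is still below that loop's own trip count.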
+
+template <typename T>
+void foo2(T start, T end, T step){
+ T i,j,k;
+ #pragma omp fuse
+ {
+ for(i = start; i < end; i += step) body(i);
+ for(j = end; j > start; j -= step) body(j);
+ for(k = start+step; k < end+step; k += step) body(k);
+ }
+}
+
+extern "C" void tfoo2() {
+ foo2<int>(0, 64, 4);
+}
+
+extern "C" void foo3() {
+ double arr[256];
+ #pragma omp fuse
+ {
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ }
+ for(int c = 42; auto &&v: arr) body(c,v);
+ for(int cc = 37; auto &&vv: arr) body(cc, vv);
+ }
+}
+
+extern "C" void foo4() {
+ double arr[256];
+
+ #pragma omp fuse looprange(2,2)
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ for(int k = 0; k < 64; ++k) body(k);
+ for(int c = 42; auto &&v: arr) body(c,v);
+ }
+}
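+// Here looprange(2,2) fuses only the second and third loops (j and k); the
+// i loop and the range-based for over arr remain separate top-level loops.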
+
+// This exemplifies the use of loop transformations that generate
+// more than one top-level canonical loop nest (e.g. split, loopranged fuse, ...)
+extern "C" void foo5() {
+ double arr[256];
+ #pragma omp fuse looprange(2,2)
+ {
+ #pragma omp fuse looprange(2,2)
+ {
+ for(int i = 0; i < 128; ++i) body(i);
+ for(int j = 0; j < 256; j+=2) body(j);
+ for(int k = 0; k < 512; ++k) body(k);
+ }
+ for(int c = 42; auto &&v: arr) body(c,v);
+ for(int cc = 37; auto &&vv: arr) body(cc, vv);
+ }
+}
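+// Rough sketch of the expected result: the inner directive fuses j and k
+// while leaving i as its own top-level loop, and the outer looprange(2,2)
+// then fuses the second and third members of the resulting loop sequence,
+// i.e. the fused j/k loop and the range-based for over c.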
+
+
+#endif
+// CHECK1-LABEL: define dso_local void @body(
+// CHECK1-SAME: ...) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo1(
+// CHECK1-SAME: i32 noundef [[START1:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK1-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK1-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP21]], %[[COND_TRUE]] ], [ [[TMP22]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP16:%.*]] = icmp ult i32 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP16]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP17:%.*]] = icmp ult i32 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: br i1 [[CMP17]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP28]], [[TMP29]]
+// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP27]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[MUL19:%.*]] = mul i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP30]], [[MUL19]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP33]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP21:%.*]] = icmp ult i32 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: br i1 [[CMP21]], label %[[IF_THEN22:.*]], label %[[IF_END27:.*]]
+// CHECK1: [[IF_THEN22]]:
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL23:%.*]] = mul i32 [[TMP37]], [[TMP38]]
+// CHECK1-NEXT: [[ADD24:%.*]] = add i32 [[TMP36]], [[MUL23]]
+// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[MUL25:%.*]] = mul i32 [[TMP40]], [[TMP41]]
+// CHECK1-NEXT: [[ADD26:%.*]] = add i32 [[TMP39]], [[MUL25]]
+// CHECK1-NEXT: store i32 [[ADD26]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP42]])
+// CHECK1-NEXT: br label %[[IF_END27]]
+// CHECK1: [[IF_END27]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP43]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @tfoo2(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: call void @_Z4foo2IiEvT_S0_S0_(i32 noundef 0, i32 noundef 64, i32 noundef 4)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define linkonce_odr void @_Z4foo2IiEvT_S0_S0_(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP21:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK1-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK1-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK1-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK1-NEXT: store i32 [[ADD16]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP24]], ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[SUB23:%.*]] = sub i32 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP27]]
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP28]]
+// CHECK1-NEXT: [[SUB27:%.*]] = sub i32 [[DIV26]], 1
+// CHECK1-NEXT: store i32 [[SUB27]], ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB2]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST2]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK1-NEXT: [[ADD28:%.*]] = add i32 [[TMP29]], 1
+// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP33]], %[[COND_TRUE]] ], [ [[TMP34]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[CMP29:%.*]] = icmp ugt i32 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: br i1 [[CMP29]], label %[[COND_TRUE30:.*]], label %[[COND_FALSE31:.*]]
+// CHECK1: [[COND_TRUE30]]:
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK1-NEXT: br label %[[COND_END32:.*]]
+// CHECK1: [[COND_FALSE31]]:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: br label %[[COND_END32]]
+// CHECK1: [[COND_END32]]:
+// CHECK1-NEXT: [[COND33:%.*]] = phi i32 [ [[TMP37]], %[[COND_TRUE30]] ], [ [[TMP38]], %[[COND_FALSE31]] ]
+// CHECK1-NEXT: store i32 [[COND33]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP39]], [[TMP40]]
+// CHECK1-NEXT: br i1 [[CMP34]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP35:%.*]] = icmp ult i32 [[TMP41]], [[TMP42]]
+// CHECK1-NEXT: br i1 [[CMP35]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP44]], [[TMP45]]
+// CHECK1-NEXT: [[ADD36:%.*]] = add i32 [[TMP43]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD36]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[MUL37:%.*]] = mul i32 [[TMP47]], [[TMP48]]
+// CHECK1-NEXT: [[ADD38:%.*]] = add i32 [[TMP46]], [[MUL37]]
+// CHECK1-NEXT: store i32 [[ADD38]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP49]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP39:%.*]] = icmp ult i32 [[TMP50]], [[TMP51]]
+// CHECK1-NEXT: br i1 [[CMP39]], label %[[IF_THEN40:.*]], label %[[IF_END45:.*]]
+// CHECK1: [[IF_THEN40]]:
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL41:%.*]] = mul i32 [[TMP53]], [[TMP54]]
+// CHECK1-NEXT: [[ADD42:%.*]] = add i32 [[TMP52]], [[MUL41]]
+// CHECK1-NEXT: store i32 [[ADD42]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK1-NEXT: [[MUL43:%.*]] = mul i32 [[TMP56]], [[TMP57]]
+// CHECK1-NEXT: [[SUB44:%.*]] = sub i32 [[TMP55]], [[MUL43]]
+// CHECK1-NEXT: store i32 [[SUB44]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP58:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP58]])
+// CHECK1-NEXT: br label %[[IF_END45]]
+// CHECK1: [[IF_END45]]:
+// CHECK1-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK1-NEXT: [[CMP46:%.*]] = icmp ult i32 [[TMP59]], [[TMP60]]
+// CHECK1-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK1: [[IF_THEN47]]:
+// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTOMP_LB2]], align 4
+// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTOMP_ST2]], align 4
+// CHECK1-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL48:%.*]] = mul i32 [[TMP62]], [[TMP63]]
+// CHECK1-NEXT: [[ADD49:%.*]] = add i32 [[TMP61]], [[MUL48]]
+// CHECK1-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV2]], align 4
+// CHECK1-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK1-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTOMP_IV2]], align 4
+// CHECK1-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK1-NEXT: [[MUL50:%.*]] = mul i32 [[TMP65]], [[TMP66]]
+// CHECK1-NEXT: [[ADD51:%.*]] = add i32 [[TMP64]], [[MUL50]]
+// CHECK1-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP67:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP67]])
+// CHECK1-NEXT: br label %[[IF_END52]]
+// CHECK1: [[IF_END52]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP68]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
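+// @foo3 exercises nested fusion: the i/j pair (both 128 iterations, i32
+// index) is fused first, and the resulting construct is fused again with
+// two range-based for loops over arr. The iterator trip counts come from
+// pointer differences (ptrtoint/sub/sdiv exact 8 for double), so the outer
+// fuse max and fuse index are i64.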
+// CHECK1-LABEL: define dso_local void @foo3(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE221:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END222:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN225:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_27:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_29:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_30:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_140:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX46:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX52:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK1-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK1-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK1-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY23:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP15]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR24:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY23]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR24]], ptr [[__END222]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY26:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP16]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY26]], ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY28:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP17]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY28]], ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__END222]], align 8
+// CHECK1-NEXT: store ptr [[TMP18]], ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST31:%.*]] = ptrtoint ptr [[TMP19]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST32:%.*]] = ptrtoint ptr [[TMP20]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB33:%.*]] = sub i64 [[SUB_PTR_LHS_CAST31]], [[SUB_PTR_RHS_CAST32]]
+// CHECK1-NEXT: [[SUB_PTR_DIV34:%.*]] = sdiv exact i64 [[SUB_PTR_SUB33]], 8
+// CHECK1-NEXT: [[SUB35:%.*]] = sub nsw i64 [[SUB_PTR_DIV34]], 1
+// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[SUB35]], 1
+// CHECK1-NEXT: [[DIV37:%.*]] = sdiv i64 [[ADD36]], 1
+// CHECK1-NEXT: [[SUB38:%.*]] = sub nsw i64 [[DIV37]], 1
+// CHECK1-NEXT: store i64 [[SUB38]], ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB2]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST2]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK1-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP21]], 1
+// CHECK1-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i64 [[TMP22]], ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP41:%.*]] = icmp sgt i64 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP41]], label %[[COND_TRUE42:.*]], label %[[COND_FALSE43:.*]]
+// CHECK1: [[COND_TRUE42]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK1-NEXT: br label %[[COND_END44:.*]]
+// CHECK1: [[COND_FALSE43]]:
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: br label %[[COND_END44]]
+// CHECK1: [[COND_END44]]:
+// CHECK1-NEXT: [[COND45:%.*]] = phi i64 [ [[TMP25]], %[[COND_TRUE42]] ], [ [[TMP26]], %[[COND_FALSE43]] ]
+// CHECK1-NEXT: store i64 [[COND45]], ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[CMP47:%.*]] = icmp sgt i64 [[TMP27]], [[TMP28]]
+// CHECK1-NEXT: br i1 [[CMP47]], label %[[COND_TRUE48:.*]], label %[[COND_FALSE49:.*]]
+// CHECK1: [[COND_TRUE48]]:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK1-NEXT: br label %[[COND_END50:.*]]
+// CHECK1: [[COND_FALSE49]]:
+// CHECK1-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: br label %[[COND_END50]]
+// CHECK1: [[COND_END50]]:
+// CHECK1-NEXT: [[COND51:%.*]] = phi i64 [ [[TMP29]], %[[COND_TRUE48]] ], [ [[TMP30]], %[[COND_FALSE49]] ]
+// CHECK1-NEXT: store i64 [[COND51]], ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK1-NEXT: [[CMP53:%.*]] = icmp slt i64 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP53]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: br i1 [[CMP54]], label %[[IF_THEN:.*]], label %[[IF_END74:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: [[CONV55:%.*]] = sext i32 [[TMP35]] to i64
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[CONV56:%.*]] = sext i32 [[TMP36]] to i64
+// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV56]], [[TMP37]]
+// CHECK1-NEXT: [[ADD57:%.*]] = add nsw i64 [[CONV55]], [[MUL]]
+// CHECK1-NEXT: [[CONV58:%.*]] = trunc i64 [[ADD57]] to i32
+// CHECK1-NEXT: store i32 [[CONV58]], ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[MUL59:%.*]] = mul nsw i32 [[TMP38]], 1
+// CHECK1-NEXT: [[ADD60:%.*]] = add nsw i32 0, [[MUL59]]
+// CHECK1-NEXT: store i32 [[ADD60]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP61:%.*]] = icmp slt i32 [[TMP39]], [[TMP40]]
+// CHECK1-NEXT: br i1 [[CMP61]], label %[[IF_THEN62:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN62]]:
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL63:%.*]] = mul nsw i32 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: [[ADD64:%.*]] = add nsw i32 [[TMP41]], [[MUL63]]
+// CHECK1-NEXT: store i32 [[ADD64]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL65:%.*]] = mul nsw i32 [[TMP44]], 1
+// CHECK1-NEXT: [[ADD66:%.*]] = add nsw i32 0, [[MUL65]]
+// CHECK1-NEXT: store i32 [[ADD66]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP45]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP67:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]]
+// CHECK1-NEXT: br i1 [[CMP67]], label %[[IF_THEN68:.*]], label %[[IF_END73:.*]]
+// CHECK1: [[IF_THEN68]]:
+// CHECK1-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP49]], [[TMP50]]
+// CHECK1-NEXT: [[ADD70:%.*]] = add nsw i32 [[TMP48]], [[MUL69]]
+// CHECK1-NEXT: store i32 [[ADD70]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP51]], 2
+// CHECK1-NEXT: [[ADD72:%.*]] = add nsw i32 0, [[MUL71]]
+// CHECK1-NEXT: store i32 [[ADD72]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP52]])
+// CHECK1-NEXT: br label %[[IF_END73]]
+// CHECK1: [[IF_END73]]:
+// CHECK1-NEXT: br label %[[IF_END74]]
+// CHECK1: [[IF_END74]]:
+// CHECK1-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP75:%.*]] = icmp slt i64 [[TMP53]], [[TMP54]]
+// CHECK1-NEXT: br i1 [[CMP75]], label %[[IF_THEN76:.*]], label %[[IF_END81:.*]]
+// CHECK1: [[IF_THEN76]]:
+// CHECK1-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL77:%.*]] = mul nsw i64 [[TMP56]], [[TMP57]]
+// CHECK1-NEXT: [[ADD78:%.*]] = add nsw i64 [[TMP55]], [[MUL77]]
+// CHECK1-NEXT: store i64 [[ADD78]], ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[MUL79:%.*]] = mul nsw i64 [[TMP59]], 1
+// CHECK1-NEXT: [[ADD_PTR80:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 [[MUL79]]
+// CHECK1-NEXT: store ptr [[ADD_PTR80]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP60]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP62:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP62]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP61]], double noundef [[TMP63]])
+// CHECK1-NEXT: br label %[[IF_END81]]
+// CHECK1: [[IF_END81]]:
+// CHECK1-NEXT: [[TMP64:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[CMP82:%.*]] = icmp slt i64 [[TMP64]], [[TMP65]]
+// CHECK1-NEXT: br i1 [[CMP82]], label %[[IF_THEN83:.*]], label %[[IF_END88:.*]]
+// CHECK1: [[IF_THEN83]]:
+// CHECK1-NEXT: [[TMP66:%.*]] = load i64, ptr [[DOTOMP_LB2]], align 8
+// CHECK1-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_ST2]], align 8
+// CHECK1-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL84:%.*]] = mul nsw i64 [[TMP67]], [[TMP68]]
+// CHECK1-NEXT: [[ADD85:%.*]] = add nsw i64 [[TMP66]], [[MUL84]]
+// CHECK1-NEXT: store i64 [[ADD85]], ptr [[DOTOMP_IV2]], align 8
+// CHECK1-NEXT: [[TMP69:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV2]], align 8
+// CHECK1-NEXT: [[MUL86:%.*]] = mul nsw i64 [[TMP70]], 1
+// CHECK1-NEXT: [[ADD_PTR87:%.*]] = getelementptr inbounds double, ptr [[TMP69]], i64 [[MUL86]]
+// CHECK1-NEXT: store ptr [[ADD_PTR87]], ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: [[TMP71:%.*]] = load ptr, ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: store ptr [[TMP71]], ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP72:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK1-NEXT: [[TMP73:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP73]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP72]], double noundef [[TMP74]])
+// CHECK1-NEXT: br label %[[IF_END88]]
+// CHECK1: [[IF_END88]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP75]], 1
+// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
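+// @foo4 checks a partially fused sequence: the leading i loop is emitted
+// unfused (a plain 0..128 for loop), the j and k loops are fused via
+// .omp.fuse.index, and the trailing range-based for over arr is likewise
+// left unfused.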
+// CHECK1-LABEL: define dso_local void @foo4(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 64, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 128
+// CHECK1-NEXT: br i1 [[CMP1]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP6]])
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND2:.*]]
+// CHECK1: [[FOR_COND2]]:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4:.*]], label %[[FOR_END17:.*]]
+// CHECK1: [[FOR_BODY4]]:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP5]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP15]], 2
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP16]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: br i1 [[CMP8]], label %[[IF_THEN9:.*]], label %[[IF_END14:.*]]
+// CHECK1: [[IF_THEN9]]:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[MUL10]]
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL12:%.*]] = mul nsw i32 [[TMP22]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK1-NEXT: store i32 [[ADD13]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP23]])
+// CHECK1-NEXT: br label %[[IF_END14]]
+// CHECK1: [[IF_END14]]:
+// CHECK1-NEXT: br label %[[FOR_INC15:.*]]
+// CHECK1: [[FOR_INC15]]:
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC16:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK1-NEXT: store i32 [[INC16]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK1: [[FOR_END17]]:
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP25]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY18:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP26]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY18]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND19:.*]]
+// CHECK1: [[FOR_COND19]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: [[CMP20:%.*]] = icmp ne ptr [[TMP27]], [[TMP28]]
+// CHECK1-NEXT: br i1 [[CMP20]], label %[[FOR_BODY21:.*]], label %[[FOR_END23:.*]]
+// CHECK1: [[FOR_BODY21]]:
+// CHECK1-NEXT: [[TMP29:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP29]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = load double, ptr [[TMP31]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP30]], double noundef [[TMP32]])
+// CHECK1-NEXT: br label %[[FOR_INC22:.*]]
+// CHECK1: [[FOR_INC22]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP33]], i32 1
+// CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND19]]
+// CHECK1: [[FOR_END23]]:
+// CHECK1-NEXT: ret void
+//
+//
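+// @foo5 mirrors foo3's nested fusion with unfused neighbors: j (128
+// iterations) and k (512) are fused first; that construct is then fused
+// with one range-based for over arr (i64 index). A plain i loop (0..128)
+// runs unfused before the outer fused loop, and a final unfused
+// range-based for over arr (using cc) closes the function.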
+// CHECK1-LABEL: define dso_local void @foo5(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_121:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX22:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX29:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE264:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN265:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END267:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 512, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK1-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK1-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK1-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: br i1 [[CMP23]], label %[[COND_TRUE24:.*]], label %[[COND_FALSE25:.*]]
+// CHECK1: [[COND_TRUE24]]:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: br label %[[COND_END26:.*]]
+// CHECK1: [[COND_FALSE25]]:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: br label %[[COND_END26]]
+// CHECK1: [[COND_END26]]:
+// CHECK1-NEXT: [[COND27:%.*]] = phi i64 [ [[TMP18]], %[[COND_TRUE24]] ], [ [[TMP19]], %[[COND_FALSE25]] ]
+// CHECK1-NEXT: store i64 [[COND27]], ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[CMP28:%.*]] = icmp slt i32 [[TMP20]], 128
+// CHECK1-NEXT: br i1 [[CMP28]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP21]])
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND30:.*]]
+// CHECK1: [[FOR_COND30]]:
+// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK1-NEXT: [[CMP31:%.*]] = icmp slt i64 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP31]], label %[[FOR_BODY32:.*]], label %[[FOR_END63:.*]]
+// CHECK1: [[FOR_BODY32]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: [[CMP33:%.*]] = icmp slt i64 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: br i1 [[CMP33]], label %[[IF_THEN:.*]], label %[[IF_END53:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: [[CONV34:%.*]] = sext i32 [[TMP27]] to i64
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[CONV35:%.*]] = sext i32 [[TMP28]] to i64
+// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV35]], [[TMP29]]
+// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[CONV34]], [[MUL]]
+// CHECK1-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK1-NEXT: store i32 [[CONV37]], ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[MUL38:%.*]] = mul nsw i32 [[TMP30]], 1
+// CHECK1-NEXT: [[ADD39:%.*]] = add nsw i32 0, [[MUL38]]
+// CHECK1-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP40:%.*]] = icmp slt i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP40]], label %[[IF_THEN41:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN41]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL42:%.*]] = mul nsw i32 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: [[ADD43:%.*]] = add nsw i32 [[TMP33]], [[MUL42]]
+// CHECK1-NEXT: store i32 [[ADD43]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL44:%.*]] = mul nsw i32 [[TMP36]], 2
+// CHECK1-NEXT: [[ADD45:%.*]] = add nsw i32 0, [[MUL44]]
+// CHECK1-NEXT: store i32 [[ADD45]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP37]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP46:%.*]] = icmp slt i32 [[TMP38]], [[TMP39]]
+// CHECK1-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK1: [[IF_THEN47]]:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL48:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK1-NEXT: [[ADD49:%.*]] = add nsw i32 [[TMP40]], [[MUL48]]
+// CHECK1-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP43]], 1
+// CHECK1-NEXT: [[ADD51:%.*]] = add nsw i32 0, [[MUL50]]
+// CHECK1-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP44]])
+// CHECK1-NEXT: br label %[[IF_END52]]
+// CHECK1: [[IF_END52]]:
+// CHECK1-NEXT: br label %[[IF_END53]]
+// CHECK1: [[IF_END53]]:
+// CHECK1-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP45]], [[TMP46]]
+// CHECK1-NEXT: br i1 [[CMP54]], label %[[IF_THEN55:.*]], label %[[IF_END60:.*]]
+// CHECK1: [[IF_THEN55]]:
+// CHECK1-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[MUL56:%.*]] = mul nsw i64 [[TMP48]], [[TMP49]]
+// CHECK1-NEXT: [[ADD57:%.*]] = add nsw i64 [[TMP47]], [[MUL56]]
+// CHECK1-NEXT: store i64 [[ADD57]], ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[MUL58:%.*]] = mul nsw i64 [[TMP51]], 1
+// CHECK1-NEXT: [[ADD_PTR59:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 [[MUL58]]
+// CHECK1-NEXT: store ptr [[ADD_PTR59]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP52]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP55:%.*]] = load double, ptr [[TMP54]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP53]], double noundef [[TMP55]])
+// CHECK1-NEXT: br label %[[IF_END60]]
+// CHECK1: [[IF_END60]]:
+// CHECK1-NEXT: br label %[[FOR_INC61:.*]]
+// CHECK1: [[FOR_INC61]]:
+// CHECK1-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[INC62:%.*]] = add nsw i64 [[TMP56]], 1
+// CHECK1-NEXT: store i64 [[INC62]], ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND30]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK1: [[FOR_END63]]:
+// CHECK1-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[TMP57:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY66:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP57]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY66]], ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY68:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP58]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR69:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY68]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR69]], ptr [[__END267]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND70:.*]]
+// CHECK1: [[FOR_COND70]]:
+// CHECK1-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__END267]], align 8
+// CHECK1-NEXT: [[CMP71:%.*]] = icmp ne ptr [[TMP59]], [[TMP60]]
+// CHECK1-NEXT: br i1 [[CMP71]], label %[[FOR_BODY72:.*]], label %[[FOR_END74:.*]]
+// CHECK1: [[FOR_BODY72]]:
+// CHECK1-NEXT: [[TMP61:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: store ptr [[TMP61]], ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK1-NEXT: [[TMP63:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP63]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP62]], double noundef [[TMP64]])
+// CHECK1-NEXT: br label %[[FOR_INC73:.*]]
+// CHECK1: [[FOR_INC73]]:
+// CHECK1-NEXT: [[TMP65:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP65]], i32 1
+// CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND70]]
+// CHECK1: [[FOR_END74]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @body(
+// CHECK2-SAME: ...) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo1(
+// CHECK2-SAME: i32 noundef [[START1:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK2-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK2-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK2-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], [[TMP20]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP21]], %[[COND_TRUE]] ], [ [[TMP22]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP16:%.*]] = icmp ult i32 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP16]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP17:%.*]] = icmp ult i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: br i1 [[CMP17]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP28]], [[TMP29]]
+// CHECK2-NEXT: [[ADD18:%.*]] = add i32 [[TMP27]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL19:%.*]] = mul i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: [[ADD20:%.*]] = add i32 [[TMP30]], [[MUL19]]
+// CHECK2-NEXT: store i32 [[ADD20]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP33]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP21:%.*]] = icmp ult i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: br i1 [[CMP21]], label %[[IF_THEN22:.*]], label %[[IF_END27:.*]]
+// CHECK2: [[IF_THEN22]]:
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL23:%.*]] = mul i32 [[TMP37]], [[TMP38]]
+// CHECK2-NEXT: [[ADD24:%.*]] = add i32 [[TMP36]], [[MUL23]]
+// CHECK2-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[MUL25:%.*]] = mul i32 [[TMP40]], [[TMP41]]
+// CHECK2-NEXT: [[ADD26:%.*]] = add i32 [[TMP39]], [[MUL25]]
+// CHECK2-NEXT: store i32 [[ADD26]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP42]])
+// CHECK2-NEXT: br label %[[IF_END27]]
+// CHECK2: [[IF_END27]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP43]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo3(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE221:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END222:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN225:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_27:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_29:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_30:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_140:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX46:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX52:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK2-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK2-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK2-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY23:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP15]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR24:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY23]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR24]], ptr [[__END222]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY26:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP16]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY26]], ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY28:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP17]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY28]], ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__END222]], align 8
+// CHECK2-NEXT: store ptr [[TMP18]], ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST31:%.*]] = ptrtoint ptr [[TMP19]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST32:%.*]] = ptrtoint ptr [[TMP20]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB33:%.*]] = sub i64 [[SUB_PTR_LHS_CAST31]], [[SUB_PTR_RHS_CAST32]]
+// CHECK2-NEXT: [[SUB_PTR_DIV34:%.*]] = sdiv exact i64 [[SUB_PTR_SUB33]], 8
+// CHECK2-NEXT: [[SUB35:%.*]] = sub nsw i64 [[SUB_PTR_DIV34]], 1
+// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i64 [[SUB35]], 1
+// CHECK2-NEXT: [[DIV37:%.*]] = sdiv i64 [[ADD36]], 1
+// CHECK2-NEXT: [[SUB38:%.*]] = sub nsw i64 [[DIV37]], 1
+// CHECK2-NEXT: store i64 [[SUB38]], ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB2]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST2]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK2-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP21]], 1
+// CHECK2-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i64 [[TMP22]], ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP41:%.*]] = icmp sgt i64 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP41]], label %[[COND_TRUE42:.*]], label %[[COND_FALSE43:.*]]
+// CHECK2: [[COND_TRUE42]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: br label %[[COND_END44:.*]]
+// CHECK2: [[COND_FALSE43]]:
+// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: br label %[[COND_END44]]
+// CHECK2: [[COND_END44]]:
+// CHECK2-NEXT: [[COND45:%.*]] = phi i64 [ [[TMP25]], %[[COND_TRUE42]] ], [ [[TMP26]], %[[COND_FALSE43]] ]
+// CHECK2-NEXT: store i64 [[COND45]], ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[CMP47:%.*]] = icmp sgt i64 [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: br i1 [[CMP47]], label %[[COND_TRUE48:.*]], label %[[COND_FALSE49:.*]]
+// CHECK2: [[COND_TRUE48]]:
+// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: br label %[[COND_END50:.*]]
+// CHECK2: [[COND_FALSE49]]:
+// CHECK2-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: br label %[[COND_END50]]
+// CHECK2: [[COND_END50]]:
+// CHECK2-NEXT: [[COND51:%.*]] = phi i64 [ [[TMP29]], %[[COND_TRUE48]] ], [ [[TMP30]], %[[COND_FALSE49]] ]
+// CHECK2-NEXT: store i64 [[COND51]], ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK2-NEXT: [[CMP53:%.*]] = icmp slt i64 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP53]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP33]], [[TMP34]]
+// CHECK2-NEXT: br i1 [[CMP54]], label %[[IF_THEN:.*]], label %[[IF_END74:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: [[CONV55:%.*]] = sext i32 [[TMP35]] to i64
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[CONV56:%.*]] = sext i32 [[TMP36]] to i64
+// CHECK2-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV56]], [[TMP37]]
+// CHECK2-NEXT: [[ADD57:%.*]] = add nsw i64 [[CONV55]], [[MUL]]
+// CHECK2-NEXT: [[CONV58:%.*]] = trunc i64 [[ADD57]] to i32
+// CHECK2-NEXT: store i32 [[CONV58]], ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[MUL59:%.*]] = mul nsw i32 [[TMP38]], 1
+// CHECK2-NEXT: [[ADD60:%.*]] = add nsw i32 0, [[MUL59]]
+// CHECK2-NEXT: store i32 [[ADD60]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP61:%.*]] = icmp slt i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: br i1 [[CMP61]], label %[[IF_THEN62:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN62]]:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL63:%.*]] = mul nsw i32 [[TMP42]], [[TMP43]]
+// CHECK2-NEXT: [[ADD64:%.*]] = add nsw i32 [[TMP41]], [[MUL63]]
+// CHECK2-NEXT: store i32 [[ADD64]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL65:%.*]] = mul nsw i32 [[TMP44]], 1
+// CHECK2-NEXT: [[ADD66:%.*]] = add nsw i32 0, [[MUL65]]
+// CHECK2-NEXT: store i32 [[ADD66]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP45]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP67:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]]
+// CHECK2-NEXT: br i1 [[CMP67]], label %[[IF_THEN68:.*]], label %[[IF_END73:.*]]
+// CHECK2: [[IF_THEN68]]:
+// CHECK2-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP49]], [[TMP50]]
+// CHECK2-NEXT: [[ADD70:%.*]] = add nsw i32 [[TMP48]], [[MUL69]]
+// CHECK2-NEXT: store i32 [[ADD70]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP51]], 2
+// CHECK2-NEXT: [[ADD72:%.*]] = add nsw i32 0, [[MUL71]]
+// CHECK2-NEXT: store i32 [[ADD72]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP52]])
+// CHECK2-NEXT: br label %[[IF_END73]]
+// CHECK2: [[IF_END73]]:
+// CHECK2-NEXT: br label %[[IF_END74]]
+// CHECK2: [[IF_END74]]:
+// CHECK2-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP75:%.*]] = icmp slt i64 [[TMP53]], [[TMP54]]
+// CHECK2-NEXT: br i1 [[CMP75]], label %[[IF_THEN76:.*]], label %[[IF_END81:.*]]
+// CHECK2: [[IF_THEN76]]:
+// CHECK2-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL77:%.*]] = mul nsw i64 [[TMP56]], [[TMP57]]
+// CHECK2-NEXT: [[ADD78:%.*]] = add nsw i64 [[TMP55]], [[MUL77]]
+// CHECK2-NEXT: store i64 [[ADD78]], ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[MUL79:%.*]] = mul nsw i64 [[TMP59]], 1
+// CHECK2-NEXT: [[ADD_PTR80:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 [[MUL79]]
+// CHECK2-NEXT: store ptr [[ADD_PTR80]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP60]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP62:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP62]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP61]], double noundef [[TMP63]])
+// CHECK2-NEXT: br label %[[IF_END81]]
+// CHECK2: [[IF_END81]]:
+// CHECK2-NEXT: [[TMP64:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[CMP82:%.*]] = icmp slt i64 [[TMP64]], [[TMP65]]
+// CHECK2-NEXT: br i1 [[CMP82]], label %[[IF_THEN83:.*]], label %[[IF_END88:.*]]
+// CHECK2: [[IF_THEN83]]:
+// CHECK2-NEXT: [[TMP66:%.*]] = load i64, ptr [[DOTOMP_LB2]], align 8
+// CHECK2-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_ST2]], align 8
+// CHECK2-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL84:%.*]] = mul nsw i64 [[TMP67]], [[TMP68]]
+// CHECK2-NEXT: [[ADD85:%.*]] = add nsw i64 [[TMP66]], [[MUL84]]
+// CHECK2-NEXT: store i64 [[ADD85]], ptr [[DOTOMP_IV2]], align 8
+// CHECK2-NEXT: [[TMP69:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV2]], align 8
+// CHECK2-NEXT: [[MUL86:%.*]] = mul nsw i64 [[TMP70]], 1
+// CHECK2-NEXT: [[ADD_PTR87:%.*]] = getelementptr inbounds double, ptr [[TMP69]], i64 [[MUL86]]
+// CHECK2-NEXT: store ptr [[ADD_PTR87]], ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: [[TMP71:%.*]] = load ptr, ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: store ptr [[TMP71]], ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP72:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK2-NEXT: [[TMP73:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP73]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP72]], double noundef [[TMP74]])
+// CHECK2-NEXT: br label %[[IF_END88]]
+// CHECK2: [[IF_END88]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP75]], 1
+// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo4(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 64, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 128
+// CHECK2-NEXT: br i1 [[CMP1]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP6]])
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND2:.*]]
+// CHECK2: [[FOR_COND2]]:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+// CHECK2-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4:.*]], label %[[FOR_END17:.*]]
+// CHECK2: [[FOR_BODY4]]:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP5]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP15]], 2
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP16]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP8:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: br i1 [[CMP8]], label %[[IF_THEN9:.*]], label %[[IF_END14:.*]]
+// CHECK2: [[IF_THEN9]]:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[MUL10]]
+// CHECK2-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL12:%.*]] = mul nsw i32 [[TMP22]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK2-NEXT: store i32 [[ADD13]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP23]])
+// CHECK2-NEXT: br label %[[IF_END14]]
+// CHECK2: [[IF_END14]]:
+// CHECK2-NEXT: br label %[[FOR_INC15:.*]]
+// CHECK2: [[FOR_INC15]]:
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC16:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK2-NEXT: store i32 [[INC16]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK2: [[FOR_END17]]:
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP25]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY18:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP26]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY18]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND19:.*]]
+// CHECK2: [[FOR_COND19]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: [[CMP20:%.*]] = icmp ne ptr [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: br i1 [[CMP20]], label %[[FOR_BODY21:.*]], label %[[FOR_END23:.*]]
+// CHECK2: [[FOR_BODY21]]:
+// CHECK2-NEXT: [[TMP29:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP29]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load double, ptr [[TMP31]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP30]], double noundef [[TMP32]])
+// CHECK2-NEXT: br label %[[FOR_INC22:.*]]
+// CHECK2: [[FOR_INC22]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP33]], i32 1
+// CHECK2-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND19]]
+// CHECK2: [[FOR_END23]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo5(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_121:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX22:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX29:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE264:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN265:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END267:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 512, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK2-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK2-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK2-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK2-NEXT: br i1 [[CMP23]], label %[[COND_TRUE24:.*]], label %[[COND_FALSE25:.*]]
+// CHECK2: [[COND_TRUE24]]:
+// CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK2-NEXT: br label %[[COND_END26:.*]]
+// CHECK2: [[COND_FALSE25]]:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: br label %[[COND_END26]]
+// CHECK2: [[COND_END26]]:
+// CHECK2-NEXT: [[COND27:%.*]] = phi i64 [ [[TMP18]], %[[COND_TRUE24]] ], [ [[TMP19]], %[[COND_FALSE25]] ]
+// CHECK2-NEXT: store i64 [[COND27]], ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[CMP28:%.*]] = icmp slt i32 [[TMP20]], 128
+// CHECK2-NEXT: br i1 [[CMP28]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP21]])
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND30:.*]]
+// CHECK2: [[FOR_COND30]]:
+// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK2-NEXT: [[CMP31:%.*]] = icmp slt i64 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP31]], label %[[FOR_BODY32:.*]], label %[[FOR_END63:.*]]
+// CHECK2: [[FOR_BODY32]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: [[CMP33:%.*]] = icmp slt i64 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: br i1 [[CMP33]], label %[[IF_THEN:.*]], label %[[IF_END53:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: [[CONV34:%.*]] = sext i32 [[TMP27]] to i64
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[CONV35:%.*]] = sext i32 [[TMP28]] to i64
+// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV35]], [[TMP29]]
+// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i64 [[CONV34]], [[MUL]]
+// CHECK2-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK2-NEXT: store i32 [[CONV37]], ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[MUL38:%.*]] = mul nsw i32 [[TMP30]], 1
+// CHECK2-NEXT: [[ADD39:%.*]] = add nsw i32 0, [[MUL38]]
+// CHECK2-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP40:%.*]] = icmp slt i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP40]], label %[[IF_THEN41:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN41]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL42:%.*]] = mul nsw i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: [[ADD43:%.*]] = add nsw i32 [[TMP33]], [[MUL42]]
+// CHECK2-NEXT: store i32 [[ADD43]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL44:%.*]] = mul nsw i32 [[TMP36]], 2
+// CHECK2-NEXT: [[ADD45:%.*]] = add nsw i32 0, [[MUL44]]
+// CHECK2-NEXT: store i32 [[ADD45]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP37]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP46:%.*]] = icmp slt i32 [[TMP38]], [[TMP39]]
+// CHECK2-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK2: [[IF_THEN47]]:
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL48:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK2-NEXT: [[ADD49:%.*]] = add nsw i32 [[TMP40]], [[MUL48]]
+// CHECK2-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP43]], 1
+// CHECK2-NEXT: [[ADD51:%.*]] = add nsw i32 0, [[MUL50]]
+// CHECK2-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP44]])
+// CHECK2-NEXT: br label %[[IF_END52]]
+// CHECK2: [[IF_END52]]:
+// CHECK2-NEXT: br label %[[IF_END53]]
+// CHECK2: [[IF_END53]]:
+// CHECK2-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP45]], [[TMP46]]
+// CHECK2-NEXT: br i1 [[CMP54]], label %[[IF_THEN55:.*]], label %[[IF_END60:.*]]
+// CHECK2: [[IF_THEN55]]:
+// CHECK2-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[MUL56:%.*]] = mul nsw i64 [[TMP48]], [[TMP49]]
+// CHECK2-NEXT: [[ADD57:%.*]] = add nsw i64 [[TMP47]], [[MUL56]]
+// CHECK2-NEXT: store i64 [[ADD57]], ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[MUL58:%.*]] = mul nsw i64 [[TMP51]], 1
+// CHECK2-NEXT: [[ADD_PTR59:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 [[MUL58]]
+// CHECK2-NEXT: store ptr [[ADD_PTR59]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP52]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP54:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP55:%.*]] = load double, ptr [[TMP54]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP53]], double noundef [[TMP55]])
+// CHECK2-NEXT: br label %[[IF_END60]]
+// CHECK2: [[IF_END60]]:
+// CHECK2-NEXT: br label %[[FOR_INC61:.*]]
+// CHECK2: [[FOR_INC61]]:
+// CHECK2-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: [[INC62:%.*]] = add nsw i64 [[TMP56]], 1
+// CHECK2-NEXT: store i64 [[INC62]], ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND30]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2: [[FOR_END63]]:
+// CHECK2-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[TMP57:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY66:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP57]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY66]], ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[TMP58:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY68:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP58]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR69:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY68]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR69]], ptr [[__END267]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND70:.*]]
+// CHECK2: [[FOR_COND70]]:
+// CHECK2-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__END267]], align 8
+// CHECK2-NEXT: [[CMP71:%.*]] = icmp ne ptr [[TMP59]], [[TMP60]]
+// CHECK2-NEXT: br i1 [[CMP71]], label %[[FOR_BODY72:.*]], label %[[FOR_END74:.*]]
+// CHECK2: [[FOR_BODY72]]:
+// CHECK2-NEXT: [[TMP61:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: store ptr [[TMP61]], ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP63]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP62]], double noundef [[TMP64]])
+// CHECK2-NEXT: br label %[[FOR_INC73:.*]]
+// CHECK2: [[FOR_INC73]]:
+// CHECK2-NEXT: [[TMP65:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP65]], i32 1
+// CHECK2-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN265]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND70]]
+// CHECK2: [[FOR_END74]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @tfoo2(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: call void @_Z4foo2IiEvT_S0_S0_(i32 noundef 0, i32 noundef 64, i32 noundef 4)
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define linkonce_odr void @_Z4foo2IiEvT_S0_S0_(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP21:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK2-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK2-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK2-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK2-NEXT: store i32 [[ADD16]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: store i32 [[ADD18]], ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK2-NEXT: store i32 [[ADD20]], ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP24]], ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[SUB23:%.*]] = sub i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP27]]
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP28]]
+// CHECK2-NEXT: [[SUB27:%.*]] = sub i32 [[DIV26]], 1
+// CHECK2-NEXT: store i32 [[SUB27]], ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB2]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST2]], align 4
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_22]], align 4
+// CHECK2-NEXT: [[ADD28:%.*]] = add i32 [[TMP29]], 1
+// CHECK2-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP30]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP33]], %[[COND_TRUE]] ], [ [[TMP34]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[CMP29:%.*]] = icmp ugt i32 [[TMP35]], [[TMP36]]
+// CHECK2-NEXT: br i1 [[CMP29]], label %[[COND_TRUE30:.*]], label %[[COND_FALSE31:.*]]
+// CHECK2: [[COND_TRUE30]]:
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4
+// CHECK2-NEXT: br label %[[COND_END32:.*]]
+// CHECK2: [[COND_FALSE31]]:
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: br label %[[COND_END32]]
+// CHECK2: [[COND_END32]]:
+// CHECK2-NEXT: [[COND33:%.*]] = phi i32 [ [[TMP37]], %[[COND_TRUE30]] ], [ [[TMP38]], %[[COND_FALSE31]] ]
+// CHECK2-NEXT: store i32 [[COND33]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: br i1 [[CMP34]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP35:%.*]] = icmp ult i32 [[TMP41]], [[TMP42]]
+// CHECK2-NEXT: br i1 [[CMP35]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP44]], [[TMP45]]
+// CHECK2-NEXT: [[ADD36:%.*]] = add i32 [[TMP43]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD36]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL37:%.*]] = mul i32 [[TMP47]], [[TMP48]]
+// CHECK2-NEXT: [[ADD38:%.*]] = add i32 [[TMP46]], [[MUL37]]
+// CHECK2-NEXT: store i32 [[ADD38]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP49]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP39:%.*]] = icmp ult i32 [[TMP50]], [[TMP51]]
+// CHECK2-NEXT: br i1 [[CMP39]], label %[[IF_THEN40:.*]], label %[[IF_END45:.*]]
+// CHECK2: [[IF_THEN40]]:
+// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL41:%.*]] = mul i32 [[TMP53]], [[TMP54]]
+// CHECK2-NEXT: [[ADD42:%.*]] = add i32 [[TMP52]], [[MUL41]]
+// CHECK2-NEXT: store i32 [[ADD42]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[MUL43:%.*]] = mul i32 [[TMP56]], [[TMP57]]
+// CHECK2-NEXT: [[SUB44:%.*]] = sub i32 [[TMP55]], [[MUL43]]
+// CHECK2-NEXT: store i32 [[SUB44]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP58]])
+// CHECK2-NEXT: br label %[[IF_END45]]
+// CHECK2: [[IF_END45]]:
+// CHECK2-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4
+// CHECK2-NEXT: [[CMP46:%.*]] = icmp ult i32 [[TMP59]], [[TMP60]]
+// CHECK2-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK2: [[IF_THEN47]]:
+// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTOMP_LB2]], align 4
+// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTOMP_ST2]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL48:%.*]] = mul i32 [[TMP62]], [[TMP63]]
+// CHECK2-NEXT: [[ADD49:%.*]] = add i32 [[TMP61]], [[MUL48]]
+// CHECK2-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV2]], align 4
+// CHECK2-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4
+// CHECK2-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTOMP_IV2]], align 4
+// CHECK2-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4
+// CHECK2-NEXT: [[MUL50:%.*]] = mul i32 [[TMP65]], [[TMP66]]
+// CHECK2-NEXT: [[ADD51:%.*]] = add i32 [[TMP64]], [[MUL50]]
+// CHECK2-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP67:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP67]])
+// CHECK2-NEXT: br label %[[IF_END52]]
+// CHECK2: [[IF_END52]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP68]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//.
+// CHECK1: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK1: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK1: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK1: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK1: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK1: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+//.
+// CHECK2: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK2: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK2: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK2: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK2: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK2: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK2: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK2: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+//.
diff --git a/clang/test/OpenMP/fuse_messages.cpp b/clang/test/OpenMP/fuse_messages.cpp
new file mode 100644
index 0000000..b86ce95
--- /dev/null
+++ b/clang/test/OpenMP/fuse_messages.cpp
@@ -0,0 +1,209 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -fsyntax-only -Wuninitialized -verify %s
+
+void func() {
+
+ // expected-error@+2 {{statement after '#pragma omp fuse' must be a loop sequence containing canonical loops or loop-generating constructs}}
+ #pragma omp fuse
+ ;
+
+ // expected-error@+2 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {int bar = 0;}
+
+ // expected-error@+4 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; ++i);
+ int x = 2;
+ }
+
+ // expected-error@+2 {{statement after '#pragma omp fuse' must be a loop sequence containing canonical loops or loop-generating constructs}}
+ #pragma omp fuse
+ #pragma omp for
+ for (int i = 0; i < 7; ++i)
+ ;
+
+ {
+ // expected-error@+2 {{expected statement}}
+ #pragma omp fuse
+ }
+
+ // expected-warning@+1 {{extra tokens at the end of '#pragma omp fuse' are ignored}}
+ #pragma omp fuse foo
+ {
+ for (int i = 0; i < 7; ++i)
+ ;
+ for(int j = 0; j < 100; ++j);
+
+ }
+
+
+ // expected-error@+1 {{unexpected OpenMP clause 'final' in directive '#pragma omp fuse'}}
+ #pragma omp fuse final(0)
+ {
+ for (int i = 0; i < 7; ++i)
+ ;
+ for(int j = 0; j < 100; ++j);
+
+ }
+
+ //expected-error@+3 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; i*=2) {
+ ;
+ }
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error@+2 {{loop sequence after '#pragma omp fuse' must contain at least 1 canonical loop or loop-generating construct}}
+ #pragma omp fuse
+ {}
+
+ //expected-error@+3 {{statement after '#pragma omp fuse' must be a for loop}}
+ #pragma omp fuse
+ {
+ #pragma omp unroll full
+ for(int i = 0; i < 10; ++i);
+
+ for(int j = 0; j < 10; ++j);
+ }
+
+ //expected-warning@+2 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse
+ {
+ for(int i = 0; i < 10; ++i);
+ }
+
+ //expected-warning@+1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(1, 1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error@+1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(1, -1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ //expected-error@+1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(1, 0)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ }
+
+ const int x = 1;
+ constexpr int y = 4;
+ //expected-error@+1 {{looprange clause selects loops from 1 to 4 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(x,y)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error@+1 {{looprange clause selects loops from 1 to 420 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(1,420)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error@+1 {{looprange clause selects loops from 1 to 6 but this exceeds the number of loops (5) in the loop sequence}}
+ #pragma omp fuse looprange(1,6)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ // This fusion results in 2 loops
+ #pragma omp fuse looprange(1,2)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+ }
+
+ //expected-error@+1 {{looprange clause selects loops from 2 to 4 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(2,3)
+ {
+ #pragma omp unroll partial(2)
+ for(int i = 0; i < 10; ++i);
+
+ #pragma omp reverse
+ for(int j = 0; j < 10; ++j);
+
+ #pragma omp fuse
+ {
+ {
+ #pragma omp reverse
+ for(int j = 0; j < 10; ++j);
+ }
+ for(int k = 0; k < 50; ++k);
+ }
+ }
+}
+
+// In a template context, but the expression itself is not instantiation-dependent
+template <typename T>
+static void templated_func() {
+
+ //expected-warning@+1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(2,1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+ //expected-error@+1 {{looprange clause selects loops from 3 to 5 but this exceeds the number of loops (3) in the loop sequence}}
+ #pragma omp fuse looprange(3,3)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+
+}
+
+template <int V>
+static void templated_func_value_dependent() {
+
+ //expected-warning@+1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+ #pragma omp fuse looprange(V,1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+}
+
+template <typename T>
+static void templated_func_type_dependent() {
+ constexpr T s = 1;
+
+ //expected-error@+1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+ #pragma omp fuse looprange(s,s-1)
+ {
+ for(int i = 0; i < 10; ++i);
+ for(int j = 0; j < 100; ++j);
+ for(int k = 0; k < 50; ++k);
+ }
+}
+
+
+void template_inst() {
+ // expected-note@+1 {{in instantiation of function template specialization 'templated_func<int>' requested here}}
+ templated_func<int>();
+ // expected-note@+1 {{in instantiation of function template specialization 'templated_func_value_dependent<1>' requested here}}
+ templated_func_value_dependent<1>();
+ // expected-note@+1 {{in instantiation of function template specialization 'templated_func_type_dependent<int>' requested here}}
+ templated_func_type_dependent<int>();
+}
+
+
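For contrast with the diagnostics above, a well-formed use of the directive
looks like the following sketch (illustrative only; body() is a stand-in
callee): looprange(1, 2) selects the first two loops of the sequence for
fusion and leaves the k-loop untouched.

  extern void body(int);

  void valid_fuse() {
    // Fuse only loops 1..2 of the three-loop sequence.
    #pragma omp fuse looprange(1, 2)
    {
      for (int i = 0; i < 10; ++i) body(i);
      for (int j = 0; j < 20; ++j) body(j);
      for (int k = 0; k < 30; ++k) body(k); // not fused
    }
  }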
diff --git a/clang/test/Parser/cxx2b-lambdas-ext-warns.cpp b/clang/test/Parser/cxx2b-lambdas-ext-warns.cpp
index 7ffb7aae..8c7a778 100644
--- a/clang/test/Parser/cxx2b-lambdas-ext-warns.cpp
+++ b/clang/test/Parser/cxx2b-lambdas-ext-warns.cpp
@@ -1,9 +1,7 @@
-// RUN: %clang_cc1 -std=c++20 %s -verify=cxx20
-// RUN: %clang_cc1 -std=c++23 %s -verify=cxx23
-// RUN: %clang_cc1 -std=c++23 -Wpre-c++23-compat %s -verify=precxx23
-// RUN: %clang_cc1 -std=c++23 -pedantic %s -verify=cxx23
-
-//cxx23-no-diagnostics
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -target-feature +sme -std=c++20 %s -verify=cxx20
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -target-feature +sme -std=c++23 %s -verify=cxx23
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -target-feature +sme -std=c++23 -Wpre-c++23-compat %s -verify=precxx23
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -target-feature +sme -std=c++23 -pedantic %s -verify=cxx23
auto L1 = [] constexpr {};
// cxx20-warning@-1 {{lambda without a parameter clause is a C++23 extension}}
@@ -14,3 +12,25 @@ auto L3 = [] static {};
// cxx20-warning@-1 {{lambda without a parameter clause is a C++23 extension}}
// cxx20-warning@-2 {{static lambdas are a C++23 extension}}
// precxx23-warning@-3 {{static lambdas are incompatible with C++ standards before C++23}}
+
+namespace GH161070 {
+void t1() { int a = [] __arm_streaming; }
+// precxx23-error@-1 {{'__arm_streaming' cannot be applied to a declaration}}
+// precxx23-error@-2 {{expected body of lambda expression}}
+// cxx23-error@-3 {{'__arm_streaming' cannot be applied to a declaration}}
+// cxx23-error@-4 {{expected body of lambda expression}}
+// cxx20-error@-5 {{'__arm_streaming' cannot be applied to a declaration}}
+// cxx20-error@-6 {{expected body of lambda expression}}
+// cxx20-warning@-7 {{'__arm_streaming' in this position is a C++23 extension}}
+// precxx23-warning@-8 {{'__arm_streaming' in this position is incompatible with C++ standards before C++23}}
+
+void t2() { int a = [] [[assume(true)]]; }
+// precxx23-error@-1 {{'assume' attribute cannot be applied to a declaration}}
+// precxx23-error@-2 {{expected body of lambda expression}}
+// cxx23-error@-3 {{'assume' attribute cannot be applied to a declaration}}
+// cxx23-error@-4 {{expected body of lambda expression}}
+// cxx20-error@-5 {{'assume' attribute cannot be applied to a declaration}}
+// cxx20-error@-6 {{expected body of lambda expression}}
+// cxx20-warning@-7 {{an attribute specifier sequence in this position is a C++23 extension}}
+// precxx23-warning@-8 {{an attribute specifier sequence in this position is incompatible with C++ standards before C++23}}
+}
diff --git a/clang/test/SemaCXX/amdgpu-image-rsrc.cpp b/clang/test/SemaCXX/amdgpu-image-rsrc.cpp
new file mode 100644
index 0000000..61a82d4
--- /dev/null
+++ b/clang/test/SemaCXX/amdgpu-image-rsrc.cpp
@@ -0,0 +1,17 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 -triple amdgcn -Wno-unused-value %s
+
+void foo() {
+ int n = 100;
+ __amdgpu_texture_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_texture_t' with an rvalue of type 'int'}}
+ static_cast<__amdgpu_texture_t>(n); // expected-error {{static_cast from 'int' to '__amdgpu_texture_t' is not allowed}}
+ reinterpret_cast<__amdgpu_texture_t>(n); // expected-error {{reinterpret_cast from 'int' to '__amdgpu_texture_t' is not allowed}}
+ (void)(v + v); // expected-error {{invalid operands to binary expression ('__amdgpu_texture_t' and '__amdgpu_texture_t')}}
+ int x(v); // expected-error {{cannot initialize a variable of type 'int' with an lvalue of type '__amdgpu_texture_t'}}
+ __amdgpu_texture_t k;
+}
+
+template<class T> void bar(T);
+void use(__amdgpu_texture_t r) { bar(r); }
+struct S { __amdgpu_texture_t r; int a; };
diff --git a/clang/test/SemaCXX/decltype.cpp b/clang/test/SemaCXX/decltype.cpp
index 739485b..45a4c4c 100644
--- a/clang/test/SemaCXX/decltype.cpp
+++ b/clang/test/SemaCXX/decltype.cpp
@@ -1,4 +1,5 @@
// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -Wno-c99-designator %s
+// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify -Wno-c99-designator %s
// PR5290
int const f0();
@@ -156,6 +157,8 @@ struct A {
}
};
+
+
// This shouldn't crash.
static_assert(A<int>().f<int>() == 0, "");
// The result should not be dependent.
@@ -163,6 +166,81 @@ static_assert(A<int>().f<int>() != 0, ""); // expected-error {{static assertion
// expected-note@-1 {{expression evaluates to '0 != 0'}}
}
+
+#if __cplusplus >= 201703L
+namespace GH160497 {
+
+template <class> struct S {
+ template <class>
+ inline static auto mem =
+ [] { static_assert(false); // expected-error {{static assertion failed}} \
+ // expected-note {{while substituting into a lambda expression here}}
+ return 42;
+ }();
+};
+
+using T = decltype(S<void>::mem<void>);
+ // expected-note@-1 {{in instantiation of static data member 'GH160497::S<void>::mem<void>' requested here}}
+
+
+template <class> struct S2 {
+ template <class>
+ inline static auto* mem =
+ [] { static_assert(false); // expected-error {{static assertion failed}} \
+ // expected-note {{while substituting into a lambda expression here}}
+ return static_cast<int*>(nullptr);
+ }();
+};
+
+using T2 = decltype(S2<void>::mem<void>);
+//expected-note@-1 {{in instantiation of static data member 'GH160497::S2<void>::mem<void>' requested here}}
+
+template <class> struct S3 {
+ template <class>
+ inline static int mem = // Check we don't instantiate when the type is not deduced.
+ [] { static_assert(false);
+ return 42;
+ }();
+};
+
+using T = decltype(S3<void>::mem<void>);
+}
+
+namespace N1 {
+
+template<class>
+struct S {
+ template<class>
+ inline static auto mem = 42;
+};
+
+using T = decltype(S<void>::mem<void>);
+
+T y = 42;
+
+}
+
+namespace GH161196 {
+
+template <typename> struct A {
+ static constexpr int digits = 0;
+};
+
+template <typename> struct B {
+ template <int, typename MaskInt = int, int = A<MaskInt>::digits>
+ static constexpr auto XBitMask = 0;
+};
+
+struct C {
+ using ReferenceHost = B<int>;
+ template <int> static decltype(ReferenceHost::XBitMask<0>) XBitMask;
+};
+
+void test() { (void)C::XBitMask<0>; }
+
+}
+#endif
+
template<typename>
class conditional {
};
diff --git a/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp b/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp
index 097ada3..436dfb9 100644
--- a/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp
+++ b/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp
@@ -17,8 +17,7 @@ constexpr bool A<x>::far() {
b.data_member;
requires A<x-1>::far(); // #Invalid
// expected-error@#Invalid {{recursive template instantiation exceeded maximum depth}}
- // expected-note@#Invalid {{in instantiation}}
- // expected-note@#Invalid 2 {{while}}
+ // expected-note@#Invalid 3 {{while}}
// expected-note@#Invalid {{contexts in backtrace}}
// expected-note@#Invalid {{increase recursive template instantiation depth}}
};
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index 3f01247..d49330f 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -2038,6 +2038,49 @@ void is_implicit_lifetime(int n) {
static_assert(__builtin_is_implicit_lifetime(int * __restrict));
}
+namespace GH160610 {
+class NonAggregate {
+public:
+ NonAggregate() = default;
+
+ NonAggregate(const NonAggregate&) = delete;
+ NonAggregate& operator=(const NonAggregate&) = delete;
+private:
+ int num;
+};
+
+class DataMemberInitializer {
+public:
+ DataMemberInitializer() = default;
+
+ DataMemberInitializer(const DataMemberInitializer&) = delete;
+ DataMemberInitializer& operator=(const DataMemberInitializer&) = delete;
+private:
+ int num = 0;
+};
+
+class UserProvidedConstructor {
+public:
+ UserProvidedConstructor() {}
+
+ UserProvidedConstructor(const UserProvidedConstructor&) = delete;
+ UserProvidedConstructor& operator=(const UserProvidedConstructor&) = delete;
+};
+
+static_assert(__builtin_is_implicit_lifetime(NonAggregate));
+static_assert(!__builtin_is_implicit_lifetime(DataMemberInitializer));
+static_assert(!__builtin_is_implicit_lifetime(UserProvidedConstructor));
+
+#if __cplusplus >= 202002L
+template <typename T>
+class Tpl {
+ Tpl() requires false = default ;
+};
+static_assert(!__builtin_is_implicit_lifetime(Tpl<int>));
+
+#endif
+}
+
void is_signed()
{
//static_assert(__is_signed(char));
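The GH160610 cases above follow from the definition of implicit-lifetime
class types: NonAggregate keeps a trivial (defaulted) default constructor, so
it qualifies, whereas a default member initializer (DataMemberInitializer), a
user-provided constructor (UserProvidedConstructor), or a constraint that
removes every eligible constructor (Tpl) leaves the class without a trivial
eligible constructor, so __builtin_is_implicit_lifetime returns false.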
diff --git a/clang/test/SemaOpenCL/amdgpu-image-rsrc.cl b/clang/test/SemaOpenCL/amdgpu-image-rsrc.cl
new file mode 100644
index 0000000..dc56494
--- /dev/null
+++ b/clang/test/SemaOpenCL/amdgpu-image-rsrc.cl
@@ -0,0 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -verify -cl-std=CL1.2 -triple amdgcn-amd-amdhsa %s
+// RUN: %clang_cc1 -verify -cl-std=CL2.0 -triple amdgcn-amd-amdhsa %s
+
+void f() {
+ int n = 3;
+ __amdgpu_texture_t v = (__amdgpu_texture_t)0; // expected-error {{used type '__amdgpu_texture_t' where arithmetic or pointer type is required}}
+ int k = v; // expected-error {{initializing '__private int' with an expression of incompatible type '__private __amdgpu_texture_t'}}
+ (void)(v + v); // expected-error {{invalid operands}}
+ __amdgpu_texture_t r;
+ int *p = (int*)r; // expected-error {{operand of type '__amdgpu_texture_t' where arithmetic or pointer type is required}}
+}
diff --git a/clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp b/clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp
new file mode 100644
index 0000000..51b3f72
--- /dev/null
+++ b/clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp
@@ -0,0 +1,12 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -Wno-unused-value %s
+
+void foo() {
+#pragma omp target
+ {
+ int n = 5;
+ __amdgpu_texture_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_texture_t' with an rvalue of type 'int'}}
+ (void)(v + v); // expected-error {{invalid operands to binary expression}}
+ }
+}
diff --git a/clang/test/SemaTemplate/instantiation-depth-subst-2.cpp b/clang/test/SemaTemplate/instantiation-depth-subst-2.cpp
index 2b519e9..66fd1af 100644
--- a/clang/test/SemaTemplate/instantiation-depth-subst-2.cpp
+++ b/clang/test/SemaTemplate/instantiation-depth-subst-2.cpp
@@ -2,5 +2,6 @@
template<int N> struct S { };
template<typename T> S<T() + T()> operator+(T, T); // expected-error {{instantiation exceeded maximum depth}} expected-note 2{{while substituting}}
+// expected-note@-1 {{use -ftemplate-depth=N to increase recursive template instantiation depth}}
S<0> s;
int k = s + s; // expected-note {{while substituting}}
diff --git a/clang/test/SemaTemplate/instantiation-depth-subst.cpp b/clang/test/SemaTemplate/instantiation-depth-subst.cpp
index 062a8ed..17944bc 100644
--- a/clang/test/SemaTemplate/instantiation-depth-subst.cpp
+++ b/clang/test/SemaTemplate/instantiation-depth-subst.cpp
@@ -3,7 +3,8 @@
// PR9793
template<typename T> auto f(T t) -> decltype(f(t)); // \
// expected-error {{recursive template instantiation exceeded maximum depth of 2}} \
-// expected-note 2 {{while substituting}}
+// expected-note 2 {{while substituting}} \
+// expected-note {{use -ftemplate-depth=N to increase recursive template instantiation depth}}
struct S {};
int k = f(S{}); // expected-note {{while substituting}}
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index 0e2758d..e41f4eb 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -420,7 +420,7 @@ public:
std::vector<ModuleDeps *> NewMDs;
{
std::unique_lock<std::mutex> ul(Lock);
- for (const ModuleDeps &MD : Graph) {
+ for (ModuleDeps &MD : Graph) {
auto I = Modules.find({MD.ID, 0});
if (I != Modules.end()) {
I->first.InputIndex = std::min(I->first.InputIndex, InputIndex);
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 5aab743..30e2be7 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2148,6 +2148,9 @@ public:
void VisitOMPUnrollDirective(const OMPUnrollDirective *D);
void VisitOMPReverseDirective(const OMPReverseDirective *D);
void VisitOMPInterchangeDirective(const OMPInterchangeDirective *D);
+ void VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *D);
+ void VisitOMPFuseDirective(const OMPFuseDirective *D);
void VisitOMPForDirective(const OMPForDirective *D);
void VisitOMPForSimdDirective(const OMPForSimdDirective *D);
void VisitOMPSectionsDirective(const OMPSectionsDirective *D);
@@ -2353,6 +2356,11 @@ void OMPClauseEnqueue::VisitOMPPartialClause(const OMPPartialClause *C) {
Visitor->AddStmt(C->getFactor());
}
+void OMPClauseEnqueue::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) {
+ Visitor->AddStmt(C->getFirst());
+ Visitor->AddStmt(C->getCount());
+}
+
void OMPClauseEnqueue::VisitOMPAllocatorClause(const OMPAllocatorClause *C) {
Visitor->AddStmt(C->getAllocator());
}
@@ -3317,6 +3325,15 @@ void EnqueueVisitor::VisitOMPInterchangeDirective(
VisitOMPCanonicalLoopNestTransformationDirective(D);
}
+void EnqueueVisitor::VisitOMPCanonicalLoopSequenceTransformationDirective(
+ const OMPCanonicalLoopSequenceTransformationDirective *D) {
+ VisitOMPExecutableDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPFuseDirective(const OMPFuseDirective *D) {
+ VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+}
+
void EnqueueVisitor::VisitOMPForDirective(const OMPForDirective *D) {
VisitOMPLoopDirective(D);
}
@@ -6275,6 +6292,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
return cxstring::createRef("OMPReverseDirective");
case CXCursor_OMPInterchangeDirective:
return cxstring::createRef("OMPInterchangeDirective");
+ case CXCursor_OMPFuseDirective:
+ return cxstring::createRef("OMPFuseDirective");
case CXCursor_OMPForDirective:
return cxstring::createRef("OMPForDirective");
case CXCursor_OMPForSimdDirective:
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 3c40624..56f113c 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -687,6 +687,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
case Stmt::OMPInterchangeDirectiveClass:
K = CXCursor_OMPInterchangeDirective;
break;
+ case Stmt::OMPFuseDirectiveClass:
+ K = CXCursor_OMPFuseDirective;
+ break;
case Stmt::OMPForDirectiveClass:
K = CXCursor_OMPForDirective;
break;
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 899cc47..4a8f27f 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2237,6 +2237,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) {
ASSERT_EQ(Tokens.size(), 21u) << Tokens;
EXPECT_TOKEN(Tokens[11], tok::l_square, TT_LambdaLSquare);
EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_LambdaLBrace);
+
+ Tokens = annotate("SomeFunction({[]() -> int *[] { return {}; }});");
+ ASSERT_EQ(Tokens.size(), 22u) << Tokens;
+ EXPECT_TOKEN(Tokens[3], tok::l_square, TT_LambdaLSquare);
+ EXPECT_TOKEN(Tokens[5], tok::l_paren, TT_LambdaDefinitionLParen);
+ EXPECT_TOKEN(Tokens[10], tok::l_square, TT_ArraySubscriptLSquare);
}
TEST_F(TokenAnnotatorTest, UnderstandsFunctionAnnotations) {
@@ -4159,6 +4165,14 @@ TEST_F(TokenAnnotatorTest, LineCommentTrailingBackslash) {
EXPECT_TOKEN(Tokens[1], tok::comment, TT_LineComment);
}
+TEST_F(TokenAnnotatorTest, ArrowAfterSubscript) {
+ auto Tokens =
+ annotate("return (getStructType()->getElements())[eIdx]->getName();");
+ ASSERT_EQ(Tokens.size(), 19u) << Tokens;
+ // Not TT_LambdaArrow.
+ EXPECT_TOKEN(Tokens[13], tok::arrow, TT_Unknown);
+}
+
TEST_F(TokenAnnotatorTest, QtProperty) {
auto Style = getLLVMStyle();
Style.AllowBreakBeforeQtProperty = true;
diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h
index 675ea0c..3ef8ee3 100644
--- a/compiler-rt/include/xray/xray_interface.h
+++ b/compiler-rt/include/xray/xray_interface.h
@@ -1,4 +1,4 @@
-//===- xray_interface.h -----------------------------------------*- C++ -*-===//
+//===- xray_interface.h ---------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,10 +14,17 @@
#ifndef XRAY_XRAY_INTERFACE_H
#define XRAY_XRAY_INTERFACE_H
+#ifdef __cplusplus
#include <cstddef>
#include <cstdint>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif
+#ifdef __cplusplus
extern "C" {
+#endif
/// Synchronize this with AsmPrinter::SledKind in LLVM.
enum XRayEntryType {
@@ -49,7 +56,7 @@ enum XRayEntryType {
/// achieved by marking them all with: __attribute__((xray_never_instrument))
///
/// Returns 1 on success, 0 on error.
-extern int __xray_set_handler(void (*entry)(int32_t, XRayEntryType));
+extern int __xray_set_handler(void (*entry)(int32_t, enum XRayEntryType));
/// This removes whatever the currently provided handler is. Returns 1 on
/// success, 0 on error.
@@ -60,7 +67,7 @@ extern int __xray_remove_handler();
/// start logging their subsequent affected function calls (if patched).
///
/// Returns 1 on success, 0 on error.
-extern int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType,
+extern int __xray_set_handler_arg1(void (*entry)(int32_t, enum XRayEntryType,
uint64_t));
/// Disables the XRay handler used to log first arguments of function calls.
@@ -68,7 +75,7 @@ extern int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType,
extern int __xray_remove_handler_arg1();
/// Provide a function to invoke when XRay encounters a custom event.
-extern int __xray_set_customevent_handler(void (*entry)(void *, std::size_t));
+extern int __xray_set_customevent_handler(void (*entry)(void *, size_t));
/// This removes whatever the currently provided custom event handler is.
/// Returns 1 on success, 0 on error.
@@ -95,39 +102,39 @@ enum XRayPatchingStatus {
/// This tells XRay to patch the instrumentation points in all currently loaded
/// objects. See XRayPatchingStatus for possible result values.
-extern XRayPatchingStatus __xray_patch();
+extern enum XRayPatchingStatus __xray_patch();
/// This tells XRay to patch the instrumentation points in the given object.
/// See XRayPatchingStatus for possible result values.
-extern XRayPatchingStatus __xray_patch_object(int32_t ObjId);
+extern enum XRayPatchingStatus __xray_patch_object(int32_t ObjId);
/// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible
/// result values.
-extern XRayPatchingStatus __xray_unpatch();
+extern enum XRayPatchingStatus __xray_unpatch();
/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for
/// possible result values.
-extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId);
+extern enum XRayPatchingStatus __xray_unpatch_object(int32_t ObjId);
/// This unpacks the given (packed) function id and patches
/// the corresponding function. See XRayPatchingStatus for possible
/// result values.
-extern XRayPatchingStatus __xray_patch_function(int32_t FuncId);
+extern enum XRayPatchingStatus __xray_patch_function(int32_t FuncId);
/// This patches a specific function in the given object. See XRayPatchingStatus
/// for possible result values.
-extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId,
- int32_t ObjId);
+extern enum XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId,
+ int32_t ObjId);
/// This unpacks the given (packed) function id and unpatches
/// the corresponding function. See XRayPatchingStatus for possible
/// result values.
-extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId);
+extern enum XRayPatchingStatus __xray_unpatch_function(int32_t FuncId);
/// This unpatches a specific function in the given object.
/// See XRayPatchingStatus for possible result values.
-extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId,
- int32_t ObjId);
+extern enum XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId,
+ int32_t ObjId);
/// This function unpacks the given (packed) function id and returns the address
/// of the corresponding function. We return 0 if we encounter any error, even
@@ -173,6 +180,8 @@ extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId);
/// Calling __xray_init() more than once is safe across multiple threads.
extern void __xray_init();
+#ifdef __cplusplus
} // end extern "C"
+#endif
#endif // XRAY_XRAY_INTERFACE_H
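With the guards and enum-qualified spellings above, this header now compiles
as plain C as well as C++. A minimal usage sketch (all declarations taken from
the prototypes above; the enum tags keep it valid in both languages):

  #include "xray/xray_interface.h"
  #include <stdio.h>

  static void handler(int32_t fid, enum XRayEntryType type) {
    printf("sled %d fired, type %d\n", fid, (int)type);
  }

  int main(void) {
    __xray_set_handler(handler);                /* returns 1 on success */
    enum XRayPatchingStatus s = __xray_patch(); /* patch all loaded objects */
    (void)s;
    /* ... instrumented work runs with the handler installed ... */
    __xray_unpatch();
    __xray_remove_handler();
    return 0;
  }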
diff --git a/compiler-rt/test/builtins/Unit/fixunstfdi_test.c b/compiler-rt/test/builtins/Unit/fixunstfdi_test.c
index 982f3a4..526ba5c 100644
--- a/compiler-rt/test/builtins/Unit/fixunstfdi_test.c
+++ b/compiler-rt/test/builtins/Unit/fixunstfdi_test.c
@@ -1,24 +1,25 @@
-// XFAIL: target=aarch64-{{.*}}-windows-{{.*}}
// RUN: %clang_builtins %s %librt -o %t && %run %t
// REQUIRES: librt_has_fixunstfdi
#include <stdio.h>
+#include "int_lib.h"
-#if _ARCH_PPC || __aarch64__ || __arm64ec__
+#if defined(CRT_HAS_TF_MODE)
-#include "int_lib.h"
+#define QUAD_PRECISION
+#include "fp_lib.h"
// Returns: convert a to a unsigned long long, rounding toward zero.
// Negative values all become zero.
-// Assumption: long double is a 128 bit floating point type
+// Assumption: fp_t is a 128 bit floating point type
// du_int is a 64 bit integral type
-// value in long double is representable in du_int or is negative
+// value in fp_t is representable in du_int or is negative
// (no range checking performed)
-COMPILER_RT_ABI du_int __fixunstfdi(long double a);
+COMPILER_RT_ABI du_int __fixunstfdi(fp_t a);
-int test__fixunstfdi(long double a, du_int expected)
+int test__fixunstfdi(fp_t a, du_int expected)
{
du_int x = __fixunstfdi(a);
if (x != expected)
@@ -29,13 +30,13 @@ int test__fixunstfdi(long double a, du_int expected)
char assumption_1[sizeof(du_int) == 2*sizeof(su_int)] = {0};
char assumption_2[sizeof(du_int)*CHAR_BIT == 64] = {0};
-char assumption_3[sizeof(long double)*CHAR_BIT == 128] = {0};
+char assumption_3[sizeof(fp_t)*CHAR_BIT == 128] = {0};
#endif
int main()
{
-#if _ARCH_PPC || __aarch64__ || __arm64ec__
+#if defined(CRT_HAS_TF_MODE)
if (test__fixunstfdi(0.0, 0))
return 1;
diff --git a/compiler-rt/test/builtins/Unit/multc3_test.c b/compiler-rt/test/builtins/Unit/multc3_test.c
index e9c99a7..18561cc 100644
--- a/compiler-rt/test/builtins/Unit/multc3_test.c
+++ b/compiler-rt/test/builtins/Unit/multc3_test.c
@@ -1,24 +1,26 @@
-// XFAIL: target=aarch64-{{.*}}-windows-{{.*}}
// RUN: %clang_builtins %s %librt -o %t && %run %t
// REQUIRES: librt_has_multc3
#include <stdio.h>
+#include "int_lib.h"
-#if _ARCH_PPC || __aarch64__ || __arm64ec__
+#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128)
+
+#define QUAD_PRECISION
+#include "fp_lib.h"
-#include "int_lib.h"
#include <math.h>
#include <complex.h>
// Returns: the product of a + ib and c + id
-COMPILER_RT_ABI long double _Complex
-__multc3(long double __a, long double __b, long double __c, long double __d);
+COMPILER_RT_ABI Qcomplex
+__multc3(fp_t __a, fp_t __b, fp_t __c, fp_t __d);
enum {zero, non_zero, inf, NaN, non_zero_nan};
int
-classify(long double _Complex x)
+classify(Qcomplex x)
{
if (x == 0)
return zero;
@@ -41,13 +43,13 @@ classify(long double _Complex x)
return non_zero;
}
-int test__multc3(long double a, long double b, long double c, long double d)
+int test__multc3(fp_t a, fp_t b, fp_t c, fp_t d)
{
- long double _Complex r = __multc3(a, b, c, d);
+ Qcomplex r = __multc3(a, b, c, d);
// printf("test__multc3(%Lf, %Lf, %Lf, %Lf) = %Lf + I%Lf\n",
// a, b, c, d, creall(r), cimagl(r));
- long double _Complex dividend;
- long double _Complex divisor;
+ Qcomplex dividend;
+ Qcomplex divisor;
__real__ dividend = a;
__imag__ dividend = b;
@@ -188,7 +190,7 @@ int test__multc3(long double a, long double b, long double c, long double d)
return 0;
}
-long double x[][2] =
+fp_t x[][2] =
{
{ 1.e-6, 1.e-6},
{-1.e-6, 1.e-6},
@@ -348,7 +350,7 @@ long double x[][2] =
int main()
{
-#if _ARCH_PPC || __aarch64__ || __arm64ec__
+#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128)
const unsigned N = sizeof(x) / sizeof(x[0]);
unsigned i, j;
for (i = 0; i < N; ++i)
diff --git a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c
new file mode 100644
index 0000000..2dbc681
--- /dev/null
+++ b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c
@@ -0,0 +1,56 @@
+// Check that we can patch and un-patch on demand, and that logging gets invoked
+// appropriately.
+//
+// Do not run on powerpc64le: linking XRay with a C compiler causes a linker
+// error due to std::__throw_system_error(int) being present in XRay libraries.
+// See https://github.com/llvm/llvm-project/issues/141598
+//
+// RUN: %clang_xray -fxray-instrument -std=c23 %s -o %t
+// RUN: env XRAY_OPTIONS="patch_premain=false" %run %t 2>&1 | FileCheck %s
+// RUN: %clang_xray -fxray-instrument -fno-xray-function-index -std=c23 %s -o %t
+// RUN: env XRAY_OPTIONS="patch_premain=false" %run %t 2>&1 | FileCheck %s
+
+// UNSUPPORTED: target-is-mips64,target-is-mips64el
+// UNSUPPORTED: target=powerpc64le-{{.*}}
+
+#include "xray/xray_interface.h"
+
+#include <stdio.h>
+
+bool called = false;
+
+void test_handler(int32_t fid, enum XRayEntryType type) {
+ printf("called: %d, type=%d\n", fid, (int32_t)(type));
+ called = true;
+}
+
+[[clang::xray_always_instrument]] void always_instrument() {
+ printf("always instrumented called\n");
+}
+
+int main() {
+ __xray_set_handler(test_handler);
+ always_instrument();
+ // CHECK: always instrumented called
+ auto status = __xray_patch();
+ printf("patching status: %d\n", (int32_t)status);
+ // CHECK-NEXT: patching status: 1
+ always_instrument();
+ // CHECK-NEXT: called: {{.*}}, type=0
+ // CHECK-NEXT: always instrumented called
+ // CHECK-NEXT: called: {{.*}}, type=1
+ status = __xray_unpatch();
+ printf("patching status: %d\n", (int32_t)status);
+ // CHECK-NEXT: patching status: 1
+ always_instrument();
+ // CHECK-NEXT: always instrumented called
+ status = __xray_patch();
+ printf("patching status: %d\n", (int32_t)status);
+ // CHECK-NEXT: patching status: 1
+ __xray_remove_handler();
+ always_instrument();
+ // CHECK-NEXT: always instrumented called
+ status = __xray_unpatch();
+ printf("patching status: %d\n", (int32_t)status);
+ // CHECK-NEXT: patching status: 1
+}
diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h
index 5267a58..5cd196a 100644
--- a/flang/include/flang/Lower/OpenMP/Clauses.h
+++ b/flang/include/flang/Lower/OpenMP/Clauses.h
@@ -243,6 +243,7 @@ using Initializer = tomp::clause::InitializerT<TypeTy, IdTy, ExprTy>;
using InReduction = tomp::clause::InReductionT<TypeTy, IdTy, ExprTy>;
using IsDevicePtr = tomp::clause::IsDevicePtrT<TypeTy, IdTy, ExprTy>;
using Lastprivate = tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>;
+using LoopRange = tomp::clause::LoopRangeT<TypeTy, IdTy, ExprTy>;
using Linear = tomp::clause::LinearT<TypeTy, IdTy, ExprTy>;
using Link = tomp::clause::LinkT<TypeTy, IdTy, ExprTy>;
using Map = tomp::clause::MapT<TypeTy, IdTy, ExprTy>;
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index fadca0a..1488529 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -613,6 +613,7 @@ public:
NODE_ENUM(OmpLinearModifier, Value)
NODE(parser, OmpLocator)
NODE(parser, OmpLocatorList)
+ NODE(parser, OmpLoopRangeClause)
NODE(parser, OmpMapClause)
NODE(OmpMapClause, Modifier)
NODE(parser, OmpMapper)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 8b23189..325ca9b 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4546,6 +4546,15 @@ struct OmpLinearClause {
std::tuple<OmpObjectList, MODIFIERS(), /*PostModified=*/bool> t;
};
+// Ref: [6.0:207-208]
+//
+// loop-range-clause ->
+// LOOPRANGE(first, count) // since 6.0
+struct OmpLoopRangeClause {
+ TUPLE_CLASS_BOILERPLATE(OmpLoopRangeClause);
+ std::tuple<ScalarIntConstantExpr, ScalarIntConstantExpr> t;
+};
+
// Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158]
//
// map-clause ->
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 48b90cc..fac37a3 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1036,6 +1036,11 @@ Link make(const parser::OmpClause::Link &inp,
return Link{/*List=*/makeObjects(inp.v, semaCtx)};
}
+LoopRange make(const parser::OmpClause::Looprange &inp,
+ semantics::SemanticsContext &semaCtx) {
+ llvm_unreachable("Unimplemented: looprange");
+}
+
Map make(const parser::OmpClause::Map &inp,
semantics::SemanticsContext &semaCtx) {
// inp.v -> parser::OmpMapClause
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index d8e36ea..9969ee4 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -2284,6 +2284,213 @@ public:
}
};
+static std::pair<mlir::Value, hlfir::AssociateOp>
+getVariable(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value val) {
+ // If it is an expression, create a variable from it; otherwise forward
+ // the value.
+ hlfir::AssociateOp associate;
+ if (!mlir::isa<hlfir::ExprType>(val.getType()))
+ return {val, associate};
+ hlfir::Entity entity{val};
+ mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder);
+ associate = hlfir::genAssociateExpr(loc, builder, entity, entity.getType(),
+ "", byRefAttr);
+ return {associate.getBase(), associate};
+}
+
+class IndexOpConversion : public mlir::OpRewritePattern<hlfir::IndexOp> {
+public:
+ using mlir::OpRewritePattern<hlfir::IndexOp>::OpRewritePattern;
+
+ llvm::LogicalResult
+ matchAndRewrite(hlfir::IndexOp op,
+ mlir::PatternRewriter &rewriter) const override {
+ // We simplify only a limited set of cases:
+ // 1) the substring length must be known at compile time;
+ // 2) if the substring length is 0, replace the result with 1 for a
+ // forward search, or with the string length + 1 otherwise (the builder
+ // will const-fold if the lookup direction is known at compile time);
+ // 3) if the string length is known at compile time and is shorter than
+ // the substring length, replace the result with zero;
+ // 4) if the substring length is one, inline it as a simple search loop;
+ // 5) for a forward search over strings of kind=1 the runtime is faster.
+ // In all remaining cases do not simplify; rely on a runtime call instead.
+
+ fir::FirOpBuilder builder{rewriter, op.getOperation()};
+ const mlir::Location &loc = op->getLoc();
+
+ auto resultTy = op.getType();
+ mlir::Value back = op.getBack();
+ mlir::Value substrLen =
+ hlfir::genCharLength(loc, builder, hlfir::Entity{op.getSubstr()});
+
+ auto substrLenCst = fir::getIntIfConstant(substrLen);
+ if (!substrLenCst) {
+ return rewriter.notifyMatchFailure(
+ op, "substring length unknown at compile time");
+ }
+ mlir::Value strLen =
+ hlfir::genCharLength(loc, builder, hlfir::Entity{op.getStr()});
+ auto i1Ty = builder.getI1Type();
+ auto idxTy = builder.getIndexType();
+ if (*substrLenCst == 0) {
+ mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1);
+ // zero length substring. For back search replace with
+ // strLen+1, or otherwise with 1.
+ mlir::Value strEnd = mlir::arith::AddIOp::create(
+ builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx);
+ if (back)
+ back = builder.createConvert(loc, i1Ty, back);
+ else
+ back = builder.createIntegerConstant(loc, i1Ty, 0);
+ mlir::Value result =
+ mlir::arith::SelectOp::create(builder, loc, back, strEnd, oneIdx);
+
+ rewriter.replaceOp(op, builder.createConvert(loc, resultTy, result));
+ return mlir::success();
+ }
+
+ if (auto strLenCst = fir::getIntIfConstant(strLen)) {
+ if (*strLenCst < *substrLenCst) {
+ rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 0));
+ return mlir::success();
+ }
+ if (*strLenCst == 0) {
+ // both strings have zero length
+ rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 1));
+ return mlir::success();
+ }
+ }
+ if (*substrLenCst != 1) {
+ return rewriter.notifyMatchFailure(
+ op, "rely on runtime implementation if substring length > 1");
+ }
+ // For a forward search with character kind=1 the runtime uses memchr,
+ // which is well optimized, and the memchr idiom is not yet recognized
+ // by LLVM. On a micro-kernel test with strings of length 40 the runtime
+ // had ~2x lower execution time than the inlined code. If the search
+ // direction is unknown at compile time, pessimistically assume "forward".
+ std::optional<bool> isBack;
+ if (back) {
+ if (auto backCst = fir::getIntIfConstant(back))
+ isBack = *backCst != 0;
+ } else {
+ isBack = false;
+ }
+ auto charTy = mlir::cast<fir::CharacterType>(
+ hlfir::getFortranElementType(op.getSubstr().getType()));
+ unsigned kind = charTy.getFKind();
+ if (kind == 1 && (!isBack || !*isBack)) {
+ return rewriter.notifyMatchFailure(
+ op, "rely on runtime implementation for character kind 1");
+ }
+
+ // All checks are passed here. Generate single character search loop.
+ auto [strV, strAssociate] = getVariable(builder, loc, op.getStr());
+ auto [substrV, substrAssociate] = getVariable(builder, loc, op.getSubstr());
+ hlfir::Entity str{strV};
+ hlfir::Entity substr{substrV};
+ mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1);
+
+ auto genExtractAndConvertToInt = [&charTy, &idxTy, &oneIdx,
+ kind](mlir::Location loc,
+ fir::FirOpBuilder &builder,
+ hlfir::Entity &charStr,
+ mlir::Value index) {
+ auto bits = builder.getKindMap().getCharacterBitsize(kind);
+ auto intTy = builder.getIntegerType(bits);
+ auto charLen1Ty =
+ fir::CharacterType::getSingleton(builder.getContext(), kind);
+ mlir::Type designatorTy =
+ fir::ReferenceType::get(charLen1Ty, fir::isa_volatile_type(charTy));
+ auto idxAttr = builder.getIntegerAttr(idxTy, 0);
+
+ auto singleChr = hlfir::DesignateOp::create(
+ builder, loc, designatorTy, charStr, /*component=*/{},
+ /*compShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{},
+ /*substring=*/mlir::ValueRange{index, index},
+ /*complexPart=*/std::nullopt,
+ /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{oneIdx},
+ fir::FortranVariableFlagsAttr{});
+ auto chrVal = fir::LoadOp::create(builder, loc, singleChr);
+ mlir::Value intVal = fir::ExtractValueOp::create(
+ builder, loc, intTy, chrVal, builder.getArrayAttr(idxAttr));
+ return intVal;
+ };
+
+ auto wantChar = genExtractAndConvertToInt(loc, builder, substr, oneIdx);
+
+ // Generate search loop body with the following C equivalent:
+ // idx_t result = 0;
+ // idx_t end = strlen + 1;
+ // char want = substr[0];
+ // for (idx_t idx = 1; idx < end; ++idx) {
+ // if (result == 0) {
+ // idx_t at = back ? end - idx : idx;
+ // result = str[at-1] == want ? at : result;
+ // }
+ // }
+ if (!back)
+ back = builder.createIntegerConstant(loc, i1Ty, 0);
+ else
+ back = builder.createConvert(loc, i1Ty, back);
+ mlir::Value strEnd = mlir::arith::AddIOp::create(
+ builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx);
+ mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0);
+ auto genSearchBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+ mlir::ValueRange index,
+ mlir::ValueRange reductionArgs)
+ -> llvm::SmallVector<mlir::Value, 1> {
+ assert(index.size() == 1 && "expected single loop");
+ assert(reductionArgs.size() == 1 && "expected single reduction value");
+ mlir::Value inRes = reductionArgs[0];
+ auto resEQzero = mlir::arith::CmpIOp::create(
+ builder, loc, mlir::arith::CmpIPredicate::eq, inRes, zeroIdx);
+
+ mlir::Value res =
+ builder
+ .genIfOp(loc, {idxTy}, resEQzero,
+ /*withElseRegion=*/true)
+ .genThen([&]() {
+ mlir::Value idx = builder.createConvert(loc, idxTy, index[0]);
+ // offset = back ? end - idx : idx;
+ mlir::Value offset = mlir::arith::SelectOp::create(
+ builder, loc, back,
+ mlir::arith::SubIOp::create(builder, loc, strEnd, idx),
+ idx);
+
+ auto haveChar =
+ genExtractAndConvertToInt(loc, builder, str, offset);
+ auto charsEQ = mlir::arith::CmpIOp::create(
+ builder, loc, mlir::arith::CmpIPredicate::eq, haveChar,
+ wantChar);
+ mlir::Value newVal = mlir::arith::SelectOp::create(
+ builder, loc, charsEQ, offset, inRes);
+
+ fir::ResultOp::create(builder, loc, newVal);
+ })
+ .genElse([&]() { fir::ResultOp::create(builder, loc, inRes); })
+ .getResults()[0];
+ return {res};
+ };
+
+ llvm::SmallVector<mlir::Value, 1> loopOut =
+ hlfir::genLoopNestWithReductions(loc, builder, {strLen},
+ /*reductionInits=*/{zeroIdx},
+ genSearchBody,
+ /*isUnordered=*/false);
+ mlir::Value result = builder.createConvert(loc, resultTy, loopOut[0]);
+
+ if (strAssociate)
+ hlfir::EndAssociateOp::create(builder, loc, strAssociate);
+ if (substrAssociate)
+ hlfir::EndAssociateOp::create(builder, loc, substrAssociate);
+
+ rewriter.replaceOp(op, result);
+ return mlir::success();
+ }
+};
+
template <typename Op>
class MatmulConversion : public mlir::OpRewritePattern<Op> {
public:
@@ -2955,6 +3162,7 @@ public:
patterns.insert<ArrayShiftConversion<hlfir::CShiftOp>>(context);
patterns.insert<ArrayShiftConversion<hlfir::EOShiftOp>>(context);
patterns.insert<CmpCharOpConversion>(context);
+ patterns.insert<IndexOpConversion>(context);
patterns.insert<MatmulConversion<hlfir::MatmulTransposeOp>>(context);
patterns.insert<ReductionConversion<hlfir::CountOp>>(context);
patterns.insert<ReductionConversion<hlfir::AnyOp>>(context);
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index ea09fe0..9507021 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -1023,6 +1023,9 @@ TYPE_PARSER(
maybe(":"_tok >> nonemptyList(Parser<OmpLinearClause::Modifier>{})),
/*PostModified=*/pure(true)))
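+// OpenMP 6.0: looprange-clause -> LOOPRANGE (first, count)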
+TYPE_PARSER(construct<OmpLoopRangeClause>(
+ scalarIntConstantExpr, "," >> scalarIntConstantExpr))
+
// OpenMPv5.2 12.5.2 detach-clause -> DETACH (event-handle)
TYPE_PARSER(construct<OmpDetachClause>(Parser<OmpObject>{}))
@@ -1207,6 +1210,8 @@ TYPE_PARSER( //
parenthesized(Parser<OmpLinearClause>{}))) ||
"LINK" >> construct<OmpClause>(construct<OmpClause::Link>(
parenthesized(Parser<OmpObjectList>{}))) ||
+ "LOOPRANGE" >> construct<OmpClause>(construct<OmpClause::Looprange>(
+ parenthesized(Parser<OmpLoopRangeClause>{}))) ||
"MAP" >> construct<OmpClause>(construct<OmpClause::Map>(
parenthesized(Parser<OmpMapClause>{}))) ||
"MATCH" >> construct<OmpClause>(construct<OmpClause::Match>(
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 0fbd347..0511f5b 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2345,6 +2345,13 @@ public:
}
}
}
+ void Unparse(const OmpLoopRangeClause &x) {
+ Word("LOOPRANGE(");
+ Walk(std::get<0>(x.t));
+ Put(", ");
+ Walk(std::get<1>(x.t));
+ Put(")");
+ }
void Unparse(const OmpReductionClause &x) {
using Modifier = OmpReductionClause::Modifier;
Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": ");
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index cc2dd0a..db030bb 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -3106,6 +3106,12 @@ CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse)
CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen)
CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen)
+void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) {
+ context_.Say(GetContext().clauseSource,
+ "LOOPRANGE clause is not implemented yet"_err_en_US,
+ ContextDirectiveAsFortran());
+}
+
// Restrictions specific to each clause are implemented apart from the
// generalized restrictions.
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index a4c8922f..270642a 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -362,6 +362,24 @@ public:
explicit OmpAttributeVisitor(SemanticsContext &context)
: DirectiveAttributeVisitor(context) {}
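+  // Return the innermost enclosing scoping unit (in the Fortran sense: main
+  // program, module, subprogram, BLOCK construct, block data, or derived-type
+  // definition). Used below to enforce the same-scoping-unit rule for
+  // ALLOCATE/ALLOCATORS list items.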
+ static const Scope &scopingUnit(const Scope &scope) {
+ const Scope *iter{&scope};
+ for (; !iter->IsTopLevel(); iter = &iter->parent()) {
+ switch (iter->kind()) {
+ case Scope::Kind::BlockConstruct:
+ case Scope::Kind::BlockData:
+ case Scope::Kind::DerivedType:
+ case Scope::Kind::MainProgram:
+ case Scope::Kind::Module:
+ case Scope::Kind::Subprogram:
+ return *iter;
+ default:
+ break;
+ }
+ }
+ return *iter;
+ }
+
template <typename A> void Walk(const A &x) { parser::Walk(x, *this); }
template <typename A> bool Pre(const A &) { return true; }
template <typename A> void Post(const A &) {}
@@ -952,7 +970,6 @@ private:
void ResolveOmpNameList(const std::list<parser::Name> &, Symbol::Flag);
void ResolveOmpName(const parser::Name &, Symbol::Flag);
Symbol *ResolveName(const parser::Name *);
- Symbol *ResolveOmpObjectScope(const parser::Name *);
Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag);
Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag);
void CheckMultipleAppearances(
@@ -2920,31 +2937,6 @@ Symbol *OmpAttributeVisitor::ResolveOmpCommonBlockName(
return nullptr;
}
-// Use this function over ResolveOmpName when an omp object's scope needs
-// resolving, it's symbol flag isn't important and a simple check for resolution
-// failure is desired. Using ResolveOmpName means needing to work with the
-// context to check for failure, whereas here a pointer comparison is all that's
-// needed.
-Symbol *OmpAttributeVisitor::ResolveOmpObjectScope(const parser::Name *name) {
-
- // TODO: Investigate whether the following block can be replaced by, or
- // included in, the ResolveOmpName function
- if (auto *prev{name ? GetContext().scope.parent().FindSymbol(name->source)
- : nullptr}) {
- name->symbol = prev;
- return nullptr;
- }
-
- // TODO: Investigate whether the following block can be replaced by, or
- // included in, the ResolveOmpName function
- if (auto *ompSymbol{
- name ? GetContext().scope.FindSymbol(name->source) : nullptr}) {
- name->symbol = ompSymbol;
- return ompSymbol;
- }
- return nullptr;
-}
-
void OmpAttributeVisitor::ResolveOmpObjectList(
const parser::OmpObjectList &ompObjectList, Symbol::Flag ompFlag) {
for (const auto &ompObject : ompObjectList.v) {
@@ -3023,13 +3015,19 @@ void OmpAttributeVisitor::ResolveOmpDesignator(
context_.Say(designator.source,
"List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement"_err_en_US);
}
- if ((ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective ||
- ompFlag == Symbol::Flag::OmpExecutableAllocateDirective) &&
- ResolveOmpObjectScope(name) == nullptr) {
- context_.Say(designator.source, // 2.15.3
- "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US,
- parser::ToUpperCaseLetters(
- llvm::omp::getOpenMPDirectiveName(directive, version)));
+ bool checkScope{ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective};
+      // In 5.1 and later the scope check only applies to the declarative
+      // ALLOCATE; in 5.0 it also covers the executable form.
+ if (version == 50 && !checkScope) {
+ checkScope = ompFlag == Symbol::Flag::OmpExecutableAllocateDirective;
+ }
+ if (checkScope) {
+ if (scopingUnit(GetContext().scope) !=
+ scopingUnit(symbol->GetUltimate().owner())) {
+ context_.Say(designator.source, // 2.15.3
+ "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US,
+ parser::ToUpperCaseLetters(
+ llvm::omp::getOpenMPDirectiveName(directive, version)));
+ }
}
if (ompFlag == Symbol::Flag::OmpReduction) {
// Using variables inside of a namelist in OpenMP reductions
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 2f350f0..ef0b8cd 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1618,12 +1618,14 @@ public:
void Post(const parser::OpenMPDeclareTargetConstruct &) {
SkipImplicitTyping(false);
}
- bool Pre(const parser::OpenMPDeclarativeAllocate &) {
+ bool Pre(const parser::OpenMPDeclarativeAllocate &x) {
+ AddOmpSourceRange(x.source);
SkipImplicitTyping(true);
return true;
}
void Post(const parser::OpenMPDeclarativeAllocate &) {
SkipImplicitTyping(false);
+ messageHandler().set_currStmtSource(std::nullopt);
}
bool Pre(const parser::OpenMPDeclarativeConstruct &x) {
AddOmpSourceRange(x.source);
diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir
new file mode 100644
index 0000000..258a1d8
--- /dev/null
+++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir
@@ -0,0 +1,345 @@
+// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s
+
+// Simplification should fold hlfir.index to a constant (5)
+func.func @_QPt1() {
+// CHECK-LABEL: func.func @_QPt1() {
+// CHECK: %[[VAL_0:.*]] = arith.constant 5 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_2:.*]] = arith.constant 3 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 4 : index
+// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"}
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.char<1,4> {bindc_name = "s", uniq_name = "_QFt1Es"}
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_3]] {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+// CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX616263) : !fir.ref<!fir.char<1,3>>
+// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_2]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX616263"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>)
+// CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_8]]#0 : !fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,4>>
+// CHECK: %[[VAL_11:.*]] = fir.address_of(@_QQclX) : !fir.ref<!fir.char<1,0>>
+// CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX"} : (!fir.ref<!fir.char<1,0>>, index) -> (!fir.ref<!fir.char<1,0>>, !fir.ref<!fir.char<1,0>>)
+// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_0]] : (index) -> i32
+// CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_6]]#0 : i32, !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"}
+ %2:2 = hlfir.declare %1 {uniq_name = "_QFt1En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %c4 = arith.constant 4 : index
+ %3 = fir.alloca !fir.char<1,4> {bindc_name = "s", uniq_name = "_QFt1Es"}
+ %4:2 = hlfir.declare %3 typeparams %c4 {uniq_name = "_QFt1Es"} : (!fir.ref<!fir.char<1,4>>, index) -> (!fir.ref<!fir.char<1,4>>, !fir.ref<!fir.char<1,4>>)
+ %5 = fir.address_of(@_QQclX616263) : !fir.ref<!fir.char<1,3>>
+ %c3 = arith.constant 3 : index
+ %6:2 = hlfir.declare %5 typeparams %c3 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX616263"} : (!fir.ref<!fir.char<1,3>>, index) -> (!fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,3>>)
+ hlfir.assign %6#0 to %4#0 : !fir.ref<!fir.char<1,3>>, !fir.ref<!fir.char<1,4>>
+ %7 = fir.address_of(@_QQclX) : !fir.ref<!fir.char<1,0>>
+ %c0 = arith.constant 0 : index
+ %8:2 = hlfir.declare %7 typeparams %c0 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX"} : (!fir.ref<!fir.char<1,0>>, index) -> (!fir.ref<!fir.char<1,0>>, !fir.ref<!fir.char<1,0>>)
+ %true = arith.constant true
+ %9 = hlfir.index %8#0 in %4#0 back %true : (!fir.ref<!fir.char<1,0>>, !fir.ref<!fir.char<1,4>>, i1) -> i32
+ hlfir.assign %9 to %2#0 : i32, !fir.ref<i32>
+ return
+}
+
+// 'back' is unknown at compile time and the substring has zero length:
+// per INDEX semantics (1 forward, LEN(s)+1 backward), generate select (back ? strlen+1 : 1)
+func.func @_QPt2(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}, %arg1: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) {
+// CHECK-LABEL: func.func @_QPt2(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"},
+// CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFt2Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+// CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"}
+// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+// CHECK: %[[VAL_8:.*]] = fir.address_of(@_QQcl2X) : !fir.ref<!fir.char<2,0>>
+// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X"} : (!fir.ref<!fir.char<2,0>>, index) -> (!fir.ref<!fir.char<2,0>>, !fir.ref<!fir.char<2,0>>)
+// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref<!fir.logical<4>>
+// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_0]] : index
+// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<4>) -> i1
+// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_0]] : index
+// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (index) -> i32
+// CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt2Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+ %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"}
+ %3:2 = hlfir.declare %2 {uniq_name = "_QFt2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+ %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt2Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+ %6 = fir.address_of(@_QQcl2X) : !fir.ref<!fir.char<2,0>>
+ %c0 = arith.constant 0 : index
+ %7:2 = hlfir.declare %6 typeparams %c0 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X"} : (!fir.ref<!fir.char<2,0>>, index) -> (!fir.ref<!fir.char<2,0>>, !fir.ref<!fir.char<2,0>>)
+ %8 = fir.load %1#0 : !fir.ref<!fir.logical<4>>
+ %9 = hlfir.index %7#0 in %5#0 back %8 : (!fir.ref<!fir.char<2,0>>, !fir.boxchar<2>, !fir.logical<4>) -> i32
+ hlfir.assign %9 to %3#0 : i32, !fir.ref<i32>
+ return
+}
+
+// inline as search loop (backward)
+func.func @_QPt3(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt3(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>)
+// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref<!fir.char<2>>, index, index, index) -> !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_1]] : index
+// CHECK: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_15:.*]] = %[[VAL_0]]) -> (index) {
+// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_0]] : index
+// CHECK: %[[VAL_17:.*]] = fir.if %[[VAL_16]] -> (index) {
+// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_12]], %[[VAL_14]] : index
+// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_18]], %[[VAL_18]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_21:.*]] = fir.extract_value %[[VAL_20]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_11]] : i16
+// CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_15]] : index
+// CHECK: fir.result %[[VAL_23]] : index
+// CHECK: } else {
+// CHECK: fir.result %[[VAL_15]] : index
+// CHECK: }
+// CHECK: fir.result %[[VAL_17]] : index
+// CHECK: }
+// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]] : (index) -> i32
+// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"}
+ %2:2 = hlfir.declare %1 {uniq_name = "_QFt3En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+ %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt3Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+ %5 = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>>
+ %c1 = arith.constant 1 : index
+ %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>)
+ %true = arith.constant true
+ %7 = hlfir.index %6#0 in %4#0 back %true : (!fir.ref<!fir.char<2>>, !fir.boxchar<2>, i1) -> i32
+ hlfir.assign %7 to %2#0 : i32, !fir.ref<i32>
+ return
+}
+
+// Inline as search loop (forward)
+func.func @_QPt4(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt4(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt4En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt4En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt4Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>)
+// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref<!fir.char<2>>, index, index, index) -> !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_14:.*]] = %[[VAL_0]]) -> (index) {
+// CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_0]] : index
+// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (index) {
+// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_13]], %[[VAL_13]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_18]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_19]], %[[VAL_11]] : i16
+// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_13]], %[[VAL_14]] : index
+// CHECK: fir.result %[[VAL_21]] : index
+// CHECK: } else {
+// CHECK: fir.result %[[VAL_14]] : index
+// CHECK: }
+// CHECK: fir.result %[[VAL_16]] : index
+// CHECK: }
+// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_12]] : (index) -> i32
+// CHECK: hlfir.assign %[[VAL_22]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt4En"}
+ %2:2 = hlfir.declare %1 {uniq_name = "_QFt4En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+ %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt4Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+ %5 = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>>
+ %c1 = arith.constant 1 : index
+ %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>)
+ %false = arith.constant false
+ %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref<!fir.char<2>>, !fir.boxchar<2>, i1) -> i32
+ hlfir.assign %7 to %2#0 : i32, !fir.ref<i32>
+ return
+}
+
+// Same as t4 above, but with result kind=1
+func.func @_QPt5(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt5(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt5En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt5En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt5Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>)
+// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref<!fir.char<2>>, index, index, index) -> !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_14:.*]] = %[[VAL_0]]) -> (index) {
+// CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_0]] : index
+// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (index) {
+// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_13]], %[[VAL_13]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.char<2>>
+// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_18]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_19]], %[[VAL_11]] : i16
+// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_13]], %[[VAL_14]] : index
+// CHECK: fir.result %[[VAL_21]] : index
+// CHECK: } else {
+// CHECK: fir.result %[[VAL_14]] : index
+// CHECK: }
+// CHECK: fir.result %[[VAL_16]] : index
+// CHECK: }
+// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_12]] : (index) -> i8
+// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i8) -> i32
+// CHECK: hlfir.assign %[[VAL_23]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt5En"}
+ %2:2 = hlfir.declare %1 {uniq_name = "_QFt5En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref<!fir.char<2,?>>, index)
+ %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt5Es"} : (!fir.ref<!fir.char<2,?>>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref<!fir.char<2,?>>)
+ %5 = fir.address_of(@_QQcl2X6500) : !fir.ref<!fir.char<2>>
+ %c1 = arith.constant 1 : index
+ %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQcl2X6500"} : (!fir.ref<!fir.char<2>>, index) -> (!fir.ref<!fir.char<2>>, !fir.ref<!fir.char<2>>)
+ %false = arith.constant false
+ %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref<!fir.char<2>>, !fir.boxchar<2>, i1) -> i8
+ %8 = fir.convert %7 : (i8) -> i32
+ hlfir.assign %8 to %2#0 : i32, !fir.ref<i32>
+ return
+}
+
+// Do not simplify - the runtime call for forward search with character kind=1 is faster
+func.func @_QPt6(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt6(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant false
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt6En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt6En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt6Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
+// CHECK: %[[VAL_9:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_6]]#0 back %[[VAL_0]] : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, i1) -> i32
+// CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt6En"}
+ %2:2 = hlfir.declare %1 {uniq_name = "_QFt6En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+ %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt6Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ %5 = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>>
+ %c1 = arith.constant 1 : index
+ %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
+ %false = arith.constant false
+ %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, i1) -> i32
+ hlfir.assign %7 to %2#0 : i32, !fir.ref<i32>
+ return
+}
+
+// Do not simplify - runtime call for forward search with character kind=1 is faster
+// The search direction is unknown at compile time, so forward is pessimistically assumed
+func.func @_QPt7(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}, %arg1: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) {
+// CHECK-LABEL: func.func @_QPt7(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"},
+// CHECK-SAME: %[[ARG1:.*]]: !fir.ref<!fir.logical<4>> {fir.bindc_name = "b"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_1]] {uniq_name = "_QFt7Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt7En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt7En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt7Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_0]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
+// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref<!fir.logical<4>>
+// CHECK: %[[VAL_10:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_6]]#0 back %[[VAL_9]] : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, !fir.logical<4>) -> i32
+// CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt7Eb"} : (!fir.ref<!fir.logical<4>>, !fir.dscope) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+ %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt7En"}
+ %3:2 = hlfir.declare %2 {uniq_name = "_QFt7En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+ %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt7Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ %6 = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>>
+ %c1 = arith.constant 1 : index
+ %7:2 = hlfir.declare %6 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
+ %8 = fir.load %1#0 : !fir.ref<!fir.logical<4>>
+ %9 = hlfir.index %7#0 in %5#0 back %8 : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, !fir.logical<4>) -> i32
+ hlfir.assign %9 to %3#0 : i32, !fir.ref<i32>
+ return
+}
+
+// Inline as backward search loop for character kind=1.
+// Similar to t7, but the direction is known, so the inlined loop beats the runtime call.
+func.func @_QPt8(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt8(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt8En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt8En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt8Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
+// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref<!fir.char<1>>, index, index, index) -> !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<1>) -> i8
+// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_1]] : index
+// CHECK: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_15:.*]] = %[[VAL_0]]) -> (index) {
+// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_0]] : index
+// CHECK: %[[VAL_17:.*]] = fir.if %[[VAL_16]] -> (index) {
+// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_12]], %[[VAL_14]] : index
+// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_18]], %[[VAL_18]] typeparams %[[VAL_1]] : (!fir.boxchar<1>, index, index, index) -> !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_21:.*]] = fir.extract_value %[[VAL_20]], [0 : index] : (!fir.char<1>) -> i8
+// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_11]] : i8
+// CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_15]] : index
+// CHECK: fir.result %[[VAL_23]] : index
+// CHECK: } else {
+// CHECK: fir.result %[[VAL_15]] : index
+// CHECK: }
+// CHECK: fir.result %[[VAL_17]] : index
+// CHECK: }
+// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]] : (index) -> i32
+// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt8En"}
+ %2:2 = hlfir.declare %1 {uniq_name = "_QFt8En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+ %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt8Es"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ %5 = fir.address_of(@_QQclX65) : !fir.ref<!fir.char<1>>
+ %c1 = arith.constant 1 : index
+ %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX65"} : (!fir.ref<!fir.char<1>>, index) -> (!fir.ref<!fir.char<1>>, !fir.ref<!fir.char<1>>)
+ %true = arith.constant true
+ %7 = hlfir.index %6#0 in %4#0 back %true : (!fir.ref<!fir.char<1>>, !fir.boxchar<1>, i1) -> i32
+ hlfir.assign %7 to %2#0 : i32, !fir.ref<i32>
+ return
+}
+
diff --git a/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90 b/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90
index 16b400a..f02d0e5 100644
--- a/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90
+++ b/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90
@@ -8,8 +8,10 @@
! CHECK: cf.cond_br %{{[0-9]+}}, ^bb1, ^bb2
! CHECK-NEXT: ^bb1: // pred: ^bb0
! CHECK: cf.br ^bb2
-! CHECK-NEXT: ^bb2: // 3 preds: ^bb0, ^bb1, ^bb2
-! CHECK-NEXT: cf.br ^bb2
+! CHECK-NEXT: ^bb2: // 2 preds: ^bb0, ^bb1
+! CHECK: cf.br ^bb3
+! CHECK-NEXT: ^bb3: // 2 preds: ^bb2, ^bb3
+! CHECK: cf.br ^bb3
! CHECK-NEXT: }
subroutine sb(ninter, numnod)
diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90
index e0b084ff0..5280d1b 100644
--- a/flang/test/Semantics/OpenMP/allocate01.f90
+++ b/flang/test/Semantics/OpenMP/allocate01.f90
@@ -20,7 +20,6 @@ use omp_lib
print *, a
!WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead. [-Wopen-mp-usage]
- !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
!$omp allocate(x) allocator(omp_default_mem_alloc)
allocate ( x(a), darray(a, b) )
end subroutine sema
diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90
index fc950ea..5bfa918 100644
--- a/flang/test/Semantics/OpenMP/allocate08.f90
+++ b/flang/test/Semantics/OpenMP/allocate08.f90
@@ -27,10 +27,12 @@ use AllocateModule
!$omp allocate(x) allocator(omp_default_mem_alloc)
!$omp allocate(y) allocator(omp_default_mem_alloc)
+ !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
!$omp allocate(z) allocator(omp_default_mem_alloc)
!$omp allocate(x)
!$omp allocate(y)
+ !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
!$omp allocate(z)
!$omp allocate(w) allocator(custom_allocator)
@@ -40,5 +42,6 @@ use AllocateModule
!ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
!$omp allocate(y) allocator(custom_allocator)
!ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
+ !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears
!$omp allocate(z) allocator(custom_allocator)
end subroutine allocate
diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90
index 212e48f..c71c7ca 100644
--- a/flang/test/Semantics/OpenMP/allocators04.f90
+++ b/flang/test/Semantics/OpenMP/allocators04.f90
@@ -22,10 +22,12 @@ subroutine allocate()
trait(1)%value = default_mem_fb
custom_allocator = omp_init_allocator(omp_default_mem_space, 1, trait)
+ !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
!$omp allocators allocate(omp_default_mem_alloc: a)
allocate(a)
!ERROR: If list items within the ALLOCATORS directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause
+ !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
!$omp allocators allocate(custom_allocator: b)
allocate(b)
end subroutine
diff --git a/flang/test/Semantics/OpenMP/allocators05.f90 b/flang/test/Semantics/OpenMP/allocators05.f90
index 0e8366a..efacdfa 100644
--- a/flang/test/Semantics/OpenMP/allocators05.f90
+++ b/flang/test/Semantics/OpenMP/allocators05.f90
@@ -15,11 +15,9 @@ subroutine allocate()
integer, parameter :: LEN = 2
!$omp target private(a, b)
- !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
!$omp allocators allocate(omp_default_mem_alloc: a)
allocate(a(LEN))
!ERROR: ALLOCATORS directives that appear in a TARGET region must specify an allocator
- !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears
!$omp allocators allocate(b)
allocate(b(LEN))
!$omp end target
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 9ba898e..cccd6a3 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -45,6 +45,7 @@
#include "math/exp10.h"
#include "math/exp10f.h"
#include "math/exp10f16.h"
+#include "math/exp10m1f.h"
#include "math/expf.h"
#include "math/expf16.h"
#include "math/frexpf.h"
diff --git a/libc/shared/math/exp10m1f.h b/libc/shared/math/exp10m1f.h
new file mode 100644
index 0000000..9093705
--- /dev/null
+++ b/libc/shared/math/exp10m1f.h
@@ -0,0 +1,23 @@
+//===-- Shared exp10m1f function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_EXP10M1F_H
+#define LLVM_LIBC_SHARED_MATH_EXP10M1F_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/exp10m1f.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::exp10m1f;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_EXP10M1F_H
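As a usage sketch (assuming the usual LIBC_NAMESPACE spelling consumers of
these shared headers use; the function name comes from the using-declaration
above):

  #include "shared/math.h"

  float half_decade_minus_one() {
    // Computes 10^0.5 - 1 via the newly shared exp10m1f.
    return LIBC_NAMESPACE::shared::exp10m1f(0.5f);
  }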
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 12ffa2ab..84c1b15 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -482,6 +482,23 @@ add_header_library(
)
add_header_library(
+ exp10m1f
+ HDRS
+ exp10m1f.h
+ DEPENDS
+ .exp10f_utils
+ libc.src.errno.errno
+ libc.src.__support.common
+ libc.src.__support.FPUtil.except_value_utils
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.optimization
+)
+
+add_header_library(
erff
HDRS
erff.h
diff --git a/libc/src/__support/math/exp10m1f.h b/libc/src/__support/math/exp10m1f.h
new file mode 100644
index 0000000..9fe4ff7
--- /dev/null
+++ b/libc/src/__support/math/exp10m1f.h
@@ -0,0 +1,234 @@
+//===-- Implementation header for exp10m1f ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F_H
+
+#include "exp10f_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+namespace exp10m1f_internal {
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+static constexpr size_t N_EXCEPTS_LO = 11;
+
+static constexpr fputil::ExceptValues<float, N_EXCEPTS_LO> EXP10M1F_EXCEPTS_LO =
+ {{
+ // x = 0x1.0fe54ep-11, exp10m1f(x) = 0x1.3937eep-10 (RZ)
+ {0x3a07'f2a7U, 0x3a9c'9bf7U, 1U, 0U, 1U},
+ // x = 0x1.80e6eap-11, exp10m1f(x) = 0x1.bb8272p-10 (RZ)
+ {0x3a40'7375U, 0x3add'c139U, 1U, 0U, 1U},
+ // x = -0x1.2a33bcp-51, exp10m1f(x) = -0x1.57515ep-50 (RZ)
+ {0xa615'19deU, 0xa6ab'a8afU, 0U, 1U, 0U},
+ // x = -0x0p+0, exp10m1f(x) = -0x0p+0 (RZ)
+ {0x8000'0000U, 0x8000'0000U, 0U, 0U, 0U},
+ // x = -0x1.b59e08p-31, exp10m1f(x) = -0x1.f7d356p-30 (RZ)
+ {0xb05a'cf04U, 0xb0fb'e9abU, 0U, 1U, 1U},
+ // x = -0x1.bf342p-12, exp10m1f(x) = -0x1.014e02p-10 (RZ)
+ {0xb9df'9a10U, 0xba80'a701U, 0U, 1U, 0U},
+ // x = -0x1.6207fp-11, exp10m1f(x) = -0x1.9746cap-10 (RZ)
+ {0xba31'03f8U, 0xbacb'a365U, 0U, 1U, 1U},
+ // x = -0x1.bd0c66p-11, exp10m1f(x) = -0x1.ffe168p-10 (RZ)
+ {0xba5e'8633U, 0xbaff'f0b4U, 0U, 1U, 1U},
+ // x = -0x1.ffd84cp-10, exp10m1f(x) = -0x1.25faf2p-8 (RZ)
+ {0xbaff'ec26U, 0xbb92'fd79U, 0U, 1U, 0U},
+ // x = -0x1.a74172p-9, exp10m1f(x) = -0x1.e57be2p-8 (RZ)
+ {0xbb53'a0b9U, 0xbbf2'bdf1U, 0U, 1U, 1U},
+ // x = -0x1.cb694cp-9, exp10m1f(x) = -0x1.0764e4p-7 (RZ)
+ {0xbb65'b4a6U, 0xbc03'b272U, 0U, 1U, 0U},
+ }};
+
+static constexpr size_t N_EXCEPTS_HI = 19;
+
+static constexpr fputil::ExceptValues<float, N_EXCEPTS_HI> EXP10M1F_EXCEPTS_HI =
+ {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.8d31eep-8, exp10m1f(x) = 0x1.cc7e4cp-7 (RZ)
+ {0x3bc6'98f7U, 0x3c66'3f26U, 1U, 0U, 1U},
+ // x = 0x1.915fcep-8, exp10m1f(x) = 0x1.d15f72p-7 (RZ)
+ {0x3bc8'afe7U, 0x3c68'afb9U, 1U, 0U, 0U},
+ // x = 0x1.bcf982p-8, exp10m1f(x) = 0x1.022928p-6 (RZ)
+ {0x3bde'7cc1U, 0x3c81'1494U, 1U, 0U, 1U},
+ // x = 0x1.99ff0ap-7, exp10m1f(x) = 0x1.dee416p-6 (RZ)
+ {0x3c4c'ff85U, 0x3cef'720bU, 1U, 0U, 0U},
+ // x = 0x1.75ea14p-6, exp10m1f(x) = 0x1.b9ff16p-5 (RZ)
+ {0x3cba'f50aU, 0x3d5c'ff8bU, 1U, 0U, 0U},
+ // x = 0x1.f81b64p-6, exp10m1f(x) = 0x1.2cb6bcp-4 (RZ)
+ {0x3cfc'0db2U, 0x3d96'5b5eU, 1U, 0U, 0U},
+ // x = 0x1.fafecp+3, exp10m1f(x) = 0x1.8c880ap+52 (RZ)
+ {0x417d'7f60U, 0x59c6'4405U, 1U, 0U, 0U},
+ // x = -0x1.3bf094p-8, exp10m1f(x) = -0x1.69ba4ap-7 (RZ)
+ {0xbb9d'f84aU, 0xbc34'dd25U, 0U, 1U, 0U},
+ // x = -0x1.4558bcp-8, exp10m1f(x) = -0x1.746fb8p-7 (RZ)
+ {0xbba2'ac5eU, 0xbc3a'37dcU, 0U, 1U, 1U},
+ // x = -0x1.4bb43p-8, exp10m1f(x) = -0x1.7babe4p-7 (RZ)
+ {0xbba5'da18U, 0xbc3d'd5f2U, 0U, 1U, 1U},
+ // x = -0x1.776cc8p-8, exp10m1f(x) = -0x1.ad62c4p-7 (RZ)
+ {0xbbbb'b664U, 0xbc56'b162U, 0U, 1U, 0U},
+ // x = -0x1.f024cp-8, exp10m1f(x) = -0x1.1b20d6p-6 (RZ)
+ {0xbbf8'1260U, 0xbc8d'906bU, 0U, 1U, 1U},
+ // x = -0x1.f510eep-8, exp10m1f(x) = -0x1.1de9aap-6 (RZ)
+ {0xbbfa'8877U, 0xbc8e'f4d5U, 0U, 1U, 0U},
+ // x = -0x1.0b43c4p-7, exp10m1f(x) = -0x1.30d418p-6 (RZ)
+ {0xbc05'a1e2U, 0xbc98'6a0cU, 0U, 1U, 0U},
+ // x = -0x1.245ee4p-7, exp10m1f(x) = -0x1.4d2b86p-6 (RZ)
+ {0xbc12'2f72U, 0xbca6'95c3U, 0U, 1U, 0U},
+ // x = -0x1.f9f2dap-7, exp10m1f(x) = -0x1.1e2186p-5 (RZ)
+ {0xbc7c'f96dU, 0xbd0f'10c3U, 0U, 1U, 0U},
+ // x = -0x1.08e42p-6, exp10m1f(x) = -0x1.2b5c4p-5 (RZ)
+ {0xbc84'7210U, 0xbd15'ae20U, 0U, 1U, 1U},
+ // x = -0x1.0cdc44p-5, exp10m1f(x) = -0x1.2a2152p-4 (RZ)
+ {0xbd06'6e22U, 0xbd95'10a9U, 0U, 1U, 1U},
+ // x = -0x1.ca4322p-5, exp10m1f(x) = -0x1.ef073p-4 (RZ)
+ {0xbd65'2191U, 0xbdf7'8398U, 0U, 1U, 1U},
+ }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+} // namespace exp10m1f_internal
+
+LIBC_INLINE static constexpr float exp10m1f(float x) {
+ using namespace exp10m1f_internal;
+ using FPBits = fputil::FPBits<float>;
+ FPBits xbits(x);
+
+ uint32_t x_u = xbits.uintval();
+ uint32_t x_abs = x_u & 0x7fff'ffffU;
+
+ // When x >= log10(2^128), or x is nan
+ if (LIBC_UNLIKELY(xbits.is_pos() && x_u >= 0x421a'209bU)) {
+ if (xbits.is_finite()) {
+ int rounding = fputil::quick_get_round();
+ if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
+ return FPBits::max_normal().get_val();
+
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_OVERFLOW);
+ }
+
+ // x >= log10(2^128) and 10^x - 1 rounds to +inf, or x is +inf or nan
+ return x + FPBits::inf().get_val();
+ }
+
+ // When |x| <= log10(2) * 2^(-6)
+ if (LIBC_UNLIKELY(x_abs <= 0x3b9a'209bU)) {
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ if (auto r = EXP10M1F_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ double dx = x;
+ double dx_sq = dx * dx;
+ double c0 = dx * Exp10Base::COEFFS[0];
+ double c1 =
+ fputil::multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]);
+ double c2 =
+ fputil::multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]);
+ // 10^dx - 1 ~ (1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5) - 1
+ // = COEFFS[0] * dx + ... + COEFFS[4] * dx^5
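+    // Expanded, with polyeval(t, a0, a1, a2) = a0 + a1*t + a2*t^2 and t = dx^2:
+    //   c0 + c1*dx^2 + c2*dx^4 = COEFFS[0]*dx + COEFFS[1]*dx^2 + COEFFS[2]*dx^3
+    //                            + COEFFS[3]*dx^4 + COEFFS[4]*dx^5.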
+ return static_cast<float>(fputil::polyeval(dx_sq, c0, c1, c2));
+ }
+
+ // When x <= log10(2^-25), or x is nan
+ if (LIBC_UNLIKELY(x_u >= 0xc0f0d2f1)) {
+ // exp10m1(-inf) = -1
+ if (xbits.is_inf())
+ return -1.0f;
+ // exp10m1(nan) = nan
+ if (xbits.is_nan())
+ return x;
+
+ int rounding = fputil::quick_get_round();
+ if (rounding == FE_UPWARD || rounding == FE_TOWARDZERO ||
+ (rounding == FE_TONEAREST && x_u == 0xc0f0d2f1))
+ return -0x1.ffff'fep-1f; // -1.0f + 0x1.0p-24f
+
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_UNDERFLOW);
+ return -1.0f;
+ }
+
+ // Exact outputs when x = 1, 2, ..., 10.
+ // Quick check mask: 0x800f'ffffU = ~(bits of 1.0f | ... | bits of 10.0f)
+ if (LIBC_UNLIKELY((x_u & 0x800f'ffffU) == 0)) {
+ switch (x_u) {
+ case 0x3f800000U: // x = 1.0f
+ return 9.0f;
+ case 0x40000000U: // x = 2.0f
+ return 99.0f;
+ case 0x40400000U: // x = 3.0f
+ return 999.0f;
+ case 0x40800000U: // x = 4.0f
+ return 9'999.0f;
+ case 0x40a00000U: // x = 5.0f
+ return 99'999.0f;
+ case 0x40c00000U: // x = 6.0f
+ return 999'999.0f;
+ case 0x40e00000U: // x = 7.0f
+ return 9'999'999.0f;
+ case 0x41000000U: { // x = 8.0f
+ int rounding = fputil::quick_get_round();
+ if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
+ return 100'000'000.0f;
+ return 99'999'992.0f;
+ }
+ case 0x41100000U: { // x = 9.0f
+ int rounding = fputil::quick_get_round();
+ if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
+ return 1'000'000'000.0f;
+ return 999'999'936.0f;
+ }
+ case 0x41200000U: { // x = 10.0f
+ int rounding = fputil::quick_get_round();
+ if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
+ return 10'000'000'000.0f;
+ return 9'999'998'976.0f;
+ }
+ }
+ }
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ if (auto r = EXP10M1F_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ // Range reduction: 10^x = 2^(mid + hi) * 10^lo
+ // rr = (2^(mid + hi), lo)
+ auto rr = exp_b_range_reduc<Exp10Base>(x);
+
+ // The low part is approximated by a degree-5 minimax polynomial.
+ // 10^lo ~ 1 + COEFFS[0] * lo + ... + COEFFS[4] * lo^5
+ double lo_sq = rr.lo * rr.lo;
+ double c0 = fputil::multiply_add(rr.lo, Exp10Base::COEFFS[0], 1.0);
+ double c1 =
+ fputil::multiply_add(rr.lo, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]);
+ double c2 =
+ fputil::multiply_add(rr.lo, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]);
+ double exp10_lo = fputil::polyeval(lo_sq, c0, c1, c2);
+ // 10^x - 1 = 2^(mid + hi) * 10^lo - 1
+ // ~ mh * exp10_lo - 1
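+  // where rr.mh holds 2^(mid + hi) from the range reduction above.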
+ return static_cast<float>(fputil::multiply_add(exp10_lo, rr.mh, -1.0));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 7cd34fa..8074a39 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1593,16 +1593,7 @@ add_entrypoint_object(
HDRS
../exp10m1f.h
DEPENDS
- libc.src.errno.errno
- libc.src.__support.common
- libc.src.__support.FPUtil.except_value_utils
- libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.FPUtil.polyeval
- libc.src.__support.FPUtil.rounding_mode
- libc.src.__support.macros.optimization
- libc.src.__support.math.exp10f_utils
+ libc.src.__support.math.exp10m1f
)
add_entrypoint_object(
diff --git a/libc/src/math/generic/exp10m1f.cpp b/libc/src/math/generic/exp10m1f.cpp
index 8589e3f..87980b7 100644
--- a/libc/src/math/generic/exp10m1f.cpp
+++ b/libc/src/math/generic/exp10m1f.cpp
@@ -7,215 +7,10 @@
//===----------------------------------------------------------------------===//
#include "src/math/exp10m1f.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/common.h"
-#include "src/__support/libc_errno.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h"
-#include "src/__support/math/exp10f_utils.h"
+#include "src/__support/math/exp10m1f.h"
namespace LIBC_NAMESPACE_DECL {
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-static constexpr size_t N_EXCEPTS_LO = 11;
-
-static constexpr fputil::ExceptValues<float, N_EXCEPTS_LO> EXP10M1F_EXCEPTS_LO =
- {{
- // x = 0x1.0fe54ep-11, exp10m1f(x) = 0x1.3937eep-10 (RZ)
- {0x3a07'f2a7U, 0x3a9c'9bf7U, 1U, 0U, 1U},
- // x = 0x1.80e6eap-11, exp10m1f(x) = 0x1.bb8272p-10 (RZ)
- {0x3a40'7375U, 0x3add'c139U, 1U, 0U, 1U},
- // x = -0x1.2a33bcp-51, exp10m1f(x) = -0x1.57515ep-50 (RZ)
- {0xa615'19deU, 0xa6ab'a8afU, 0U, 1U, 0U},
- // x = -0x0p+0, exp10m1f(x) = -0x0p+0 (RZ)
- {0x8000'0000U, 0x8000'0000U, 0U, 0U, 0U},
- // x = -0x1.b59e08p-31, exp10m1f(x) = -0x1.f7d356p-30 (RZ)
- {0xb05a'cf04U, 0xb0fb'e9abU, 0U, 1U, 1U},
- // x = -0x1.bf342p-12, exp10m1f(x) = -0x1.014e02p-10 (RZ)
- {0xb9df'9a10U, 0xba80'a701U, 0U, 1U, 0U},
- // x = -0x1.6207fp-11, exp10m1f(x) = -0x1.9746cap-10 (RZ)
- {0xba31'03f8U, 0xbacb'a365U, 0U, 1U, 1U},
- // x = -0x1.bd0c66p-11, exp10m1f(x) = -0x1.ffe168p-10 (RZ)
- {0xba5e'8633U, 0xbaff'f0b4U, 0U, 1U, 1U},
- // x = -0x1.ffd84cp-10, exp10m1f(x) = -0x1.25faf2p-8 (RZ)
- {0xbaff'ec26U, 0xbb92'fd79U, 0U, 1U, 0U},
- // x = -0x1.a74172p-9, exp10m1f(x) = -0x1.e57be2p-8 (RZ)
- {0xbb53'a0b9U, 0xbbf2'bdf1U, 0U, 1U, 1U},
- // x = -0x1.cb694cp-9, exp10m1f(x) = -0x1.0764e4p-7 (RZ)
- {0xbb65'b4a6U, 0xbc03'b272U, 0U, 1U, 0U},
- }};
-
-static constexpr size_t N_EXCEPTS_HI = 19;
-
-static constexpr fputil::ExceptValues<float, N_EXCEPTS_HI> EXP10M1F_EXCEPTS_HI =
- {{
- // (input, RZ output, RU offset, RD offset, RN offset)
- // x = 0x1.8d31eep-8, exp10m1f(x) = 0x1.cc7e4cp-7 (RZ)
- {0x3bc6'98f7U, 0x3c66'3f26U, 1U, 0U, 1U},
- // x = 0x1.915fcep-8, exp10m1f(x) = 0x1.d15f72p-7 (RZ)
- {0x3bc8'afe7U, 0x3c68'afb9U, 1U, 0U, 0U},
- // x = 0x1.bcf982p-8, exp10m1f(x) = 0x1.022928p-6 (RZ)
- {0x3bde'7cc1U, 0x3c81'1494U, 1U, 0U, 1U},
- // x = 0x1.99ff0ap-7, exp10m1f(x) = 0x1.dee416p-6 (RZ)
- {0x3c4c'ff85U, 0x3cef'720bU, 1U, 0U, 0U},
- // x = 0x1.75ea14p-6, exp10m1f(x) = 0x1.b9ff16p-5 (RZ)
- {0x3cba'f50aU, 0x3d5c'ff8bU, 1U, 0U, 0U},
- // x = 0x1.f81b64p-6, exp10m1f(x) = 0x1.2cb6bcp-4 (RZ)
- {0x3cfc'0db2U, 0x3d96'5b5eU, 1U, 0U, 0U},
- // x = 0x1.fafecp+3, exp10m1f(x) = 0x1.8c880ap+52 (RZ)
- {0x417d'7f60U, 0x59c6'4405U, 1U, 0U, 0U},
- // x = -0x1.3bf094p-8, exp10m1f(x) = -0x1.69ba4ap-7 (RZ)
- {0xbb9d'f84aU, 0xbc34'dd25U, 0U, 1U, 0U},
- // x = -0x1.4558bcp-8, exp10m1f(x) = -0x1.746fb8p-7 (RZ)
- {0xbba2'ac5eU, 0xbc3a'37dcU, 0U, 1U, 1U},
- // x = -0x1.4bb43p-8, exp10m1f(x) = -0x1.7babe4p-7 (RZ)
- {0xbba5'da18U, 0xbc3d'd5f2U, 0U, 1U, 1U},
- // x = -0x1.776cc8p-8, exp10m1f(x) = -0x1.ad62c4p-7 (RZ)
- {0xbbbb'b664U, 0xbc56'b162U, 0U, 1U, 0U},
- // x = -0x1.f024cp-8, exp10m1f(x) = -0x1.1b20d6p-6 (RZ)
- {0xbbf8'1260U, 0xbc8d'906bU, 0U, 1U, 1U},
- // x = -0x1.f510eep-8, exp10m1f(x) = -0x1.1de9aap-6 (RZ)
- {0xbbfa'8877U, 0xbc8e'f4d5U, 0U, 1U, 0U},
- // x = -0x1.0b43c4p-7, exp10m1f(x) = -0x1.30d418p-6 (RZ)
- {0xbc05'a1e2U, 0xbc98'6a0cU, 0U, 1U, 0U},
- // x = -0x1.245ee4p-7, exp10m1f(x) = -0x1.4d2b86p-6 (RZ)
- {0xbc12'2f72U, 0xbca6'95c3U, 0U, 1U, 0U},
- // x = -0x1.f9f2dap-7, exp10m1f(x) = -0x1.1e2186p-5 (RZ)
- {0xbc7c'f96dU, 0xbd0f'10c3U, 0U, 1U, 0U},
- // x = -0x1.08e42p-6, exp10m1f(x) = -0x1.2b5c4p-5 (RZ)
- {0xbc84'7210U, 0xbd15'ae20U, 0U, 1U, 1U},
- // x = -0x1.0cdc44p-5, exp10m1f(x) = -0x1.2a2152p-4 (RZ)
- {0xbd06'6e22U, 0xbd95'10a9U, 0U, 1U, 1U},
- // x = -0x1.ca4322p-5, exp10m1f(x) = -0x1.ef073p-4 (RZ)
- {0xbd65'2191U, 0xbdf7'8398U, 0U, 1U, 1U},
- }};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-LLVM_LIBC_FUNCTION(float, exp10m1f, (float x)) {
- using FPBits = fputil::FPBits<float>;
- FPBits xbits(x);
-
- uint32_t x_u = xbits.uintval();
- uint32_t x_abs = x_u & 0x7fff'ffffU;
-
- // When x >= log10(2^128), or x is nan
- if (LIBC_UNLIKELY(xbits.is_pos() && x_u >= 0x421a'209bU)) {
- if (xbits.is_finite()) {
- int rounding = fputil::quick_get_round();
- if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
- return FPBits::max_normal().get_val();
-
- fputil::set_errno_if_required(ERANGE);
- fputil::raise_except_if_required(FE_OVERFLOW);
- }
-
- // x >= log10(2^128) and 10^x - 1 rounds to +inf, or x is +inf or nan
- return x + FPBits::inf().get_val();
- }
-
- // When |x| <= log10(2) * 2^(-6)
- if (LIBC_UNLIKELY(x_abs <= 0x3b9a'209bU)) {
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- if (auto r = EXP10M1F_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
- return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
- double dx = x;
- double dx_sq = dx * dx;
- double c0 = dx * Exp10Base::COEFFS[0];
- double c1 =
- fputil::multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]);
- double c2 =
- fputil::multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]);
- // 10^dx - 1 ~ (1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5) - 1
- // = COEFFS[0] * dx + ... + COEFFS[4] * dx^5
- return static_cast<float>(fputil::polyeval(dx_sq, c0, c1, c2));
- }
-
- // When x <= log10(2^-25), or x is nan
- if (LIBC_UNLIKELY(x_u >= 0xc0f0d2f1)) {
- // exp10m1(-inf) = -1
- if (xbits.is_inf())
- return -1.0f;
- // exp10m1(nan) = nan
- if (xbits.is_nan())
- return x;
-
- int rounding = fputil::quick_get_round();
- if (rounding == FE_UPWARD || rounding == FE_TOWARDZERO ||
- (rounding == FE_TONEAREST && x_u == 0xc0f0d2f1))
- return -0x1.ffff'fep-1f; // -1.0f + 0x1.0p-24f
-
- fputil::set_errno_if_required(ERANGE);
- fputil::raise_except_if_required(FE_UNDERFLOW);
- return -1.0f;
- }
-
- // Exact outputs when x = 1, 2, ..., 10.
- // Quick check mask: 0x800f'ffffU = ~(bits of 1.0f | ... | bits of 10.0f)
- if (LIBC_UNLIKELY((x_u & 0x800f'ffffU) == 0)) {
- switch (x_u) {
- case 0x3f800000U: // x = 1.0f
- return 9.0f;
- case 0x40000000U: // x = 2.0f
- return 99.0f;
- case 0x40400000U: // x = 3.0f
- return 999.0f;
- case 0x40800000U: // x = 4.0f
- return 9'999.0f;
- case 0x40a00000U: // x = 5.0f
- return 99'999.0f;
- case 0x40c00000U: // x = 6.0f
- return 999'999.0f;
- case 0x40e00000U: // x = 7.0f
- return 9'999'999.0f;
- case 0x41000000U: { // x = 8.0f
- int rounding = fputil::quick_get_round();
- if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
- return 100'000'000.0f;
- return 99'999'992.0f;
- }
- case 0x41100000U: { // x = 9.0f
- int rounding = fputil::quick_get_round();
- if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
- return 1'000'000'000.0f;
- return 999'999'936.0f;
- }
- case 0x41200000U: { // x = 10.0f
- int rounding = fputil::quick_get_round();
- if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
- return 10'000'000'000.0f;
- return 9'999'998'976.0f;
- }
- }
- }
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- if (auto r = EXP10M1F_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
- return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
- // Range reduction: 10^x = 2^(mid + hi) * 10^lo
- // rr = (2^(mid + hi), lo)
- auto rr = exp_b_range_reduc<Exp10Base>(x);
-
- // The low part is approximated by a degree-5 minimax polynomial.
- // 10^lo ~ 1 + COEFFS[0] * lo + ... + COEFFS[4] * lo^5
- double lo_sq = rr.lo * rr.lo;
- double c0 = fputil::multiply_add(rr.lo, Exp10Base::COEFFS[0], 1.0);
- double c1 =
- fputil::multiply_add(rr.lo, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]);
- double c2 =
- fputil::multiply_add(rr.lo, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]);
- double exp10_lo = fputil::polyeval(lo_sq, c0, c1, c2);
- // 10^x - 1 = 2^(mid + hi) * 10^lo - 1
- // ~ mh * exp10_lo - 1
- return static_cast<float>(fputil::multiply_add(exp10_lo, rr.mh, -1.0));
-}
+LLVM_LIBC_FUNCTION(float, exp10m1f, (float x)) { return math::exp10m1f(x); }
} // namespace LIBC_NAMESPACE_DECL
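The deleted implementation above now lives in the shared math::exp10m1f header; its core trick is evaluating the degree-5 minimax polynomial as a degree-2 polynomial in dx^2 after folding adjacent coefficients with fused multiply-adds. A minimal standalone sketch of that evaluation order, using std::fma in place of fputil::multiply_add and Taylor-series stand-ins for the real Exp10Base::COEFFS:

#include <cmath>

// Taylor-series stand-ins for Exp10Base::COEFFS (c_k = ln(10)^k / k!); the
// real implementation uses degree-5 minimax coefficients, which differ
// slightly from these values.
static const double kCoeffs[5] = {2.3025850930, 2.6509490552, 2.0346786086,
                                  1.1712551503, 0.5393829697};

// Evaluate c0*dx + c1*dx^2 + c2*dx^3 + c3*dx^4 + c4*dx^5 the way the removed
// code does: fold adjacent coefficients with fma, then run Horner on dx^2.
double exp10m1_poly_sketch(double dx) {
  double dx_sq = dx * dx;
  double c0 = dx * kCoeffs[0];
  double c1 = std::fma(dx, kCoeffs[2], kCoeffs[1]);
  double c2 = std::fma(dx, kCoeffs[4], kCoeffs[3]);
  // polyeval(dx_sq, c0, c1, c2) == c0 + c1*dx_sq + c2*dx_sq^2.
  return std::fma(std::fma(c2, dx_sq, c1), dx_sq, c0);
}

Regrouping this way shortens the dependency chain relative to plain Horner, since dx*dx and the two coefficient folds can execute in parallel.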
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 010f218..a86cbd8 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -41,12 +41,22 @@ static_assert((UINTPTR_MAX == 4294967295U) ||
"We currently only support 32- or 64-bit platforms");
#ifdef LIBC_COMPILER_IS_MSVC
-
+#ifdef LIBC_TARGET_ARCH_IS_X86
namespace LIBC_NAMESPACE_DECL {
using generic_v128 = __m128i;
using generic_v256 = __m256i;
using generic_v512 = __m512i;
} // namespace LIBC_NAMESPACE_DECL
+#else
+// Special handling when the target does not have real vector types.
+// We could potentially use uint8x16_t etc., but MSVC does not provide a
+// subscript operator for them.
+namespace LIBC_NAMESPACE_DECL {
+struct alignas(16) generic_v128 : public cpp::array<uint8_t, 16> {};
+struct alignas(32) generic_v256 : public cpp::array<uint8_t, 32> {};
+struct alignas(64) generic_v512 : public cpp::array<uint8_t, 64> {};
+} // namespace LIBC_NAMESPACE_DECL
+#endif
#else
namespace LIBC_NAMESPACE_DECL {
@@ -159,7 +169,8 @@ template <typename T> struct Memset {
LIBC_INLINE static void block(Ptr dst, uint8_t value) {
if constexpr (is_scalar_v<T> || is_vector_v<T>) {
- store<T>(dst, splat<T>(value));
+ // Avoid ambiguous call due to ADL
+ generic::store<T>(dst, splat<T>(value));
} else if constexpr (is_array_v<T>) {
using value_type = typename T::value_type;
const auto Splat = splat<value_type>(value);
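For MSVC targets without native vector types, the patch above models generic_v128/256/512 as over-aligned byte arrays derived from cpp::array. A self-contained sketch of the same idea using std::array with memcpy-based load/store; the names here are illustrative, not the actual libc helpers:

#include <array>
#include <cstdint>
#include <cstring>

// Over-aligned byte arrays stand in for vector registers when the compiler
// provides no native SIMD type.
struct alignas(16) v128 : std::array<uint8_t, 16> {};
struct alignas(32) v256 : std::array<uint8_t, 32> {};

template <typename V> V load(const void *src) {
  V value;
  std::memcpy(&value, src, sizeof(V)); // safe for unaligned sources
  return value;
}

template <typename V> void store(void *dst, V value) {
  std::memcpy(dst, &value, sizeof(V));
}

template <typename V> V splat(uint8_t byte) {
  V value;
  value.fill(byte); // broadcast one byte to every lane
  return value;
}

Unlike uint8x16_t and friends, these wrappers support subscripting everywhere, which is what the comment in the hunk is getting at.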
diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp
index 2730de3..4393f9d 100644
--- a/libc/test/UnitTest/FEnvSafeTest.cpp
+++ b/libc/test/UnitTest/FEnvSafeTest.cpp
@@ -43,7 +43,7 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) {
void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv,
const fenv_t &after_fenv) {
-#if defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC)
using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState;
const FPState &before_state = reinterpret_cast<const FPState &>(before_fenv);
const FPState &after_state = reinterpret_cast<const FPState &>(after_fenv);
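The guard above exists because the test reinterprets fenv_t as libc's AArch64 FPState, and MSVC's fenv_t does not share that layout. A hedged sketch of what the layout-dependent comparison assumes (field names illustrative):

#include <cstdint>

// Illustrative layout: AArch64 FP state is a control word (FPCR) plus a
// status word (FPSR), so comparing environments is two field compares.
struct FPState {
  uint64_t ControlWord; // FPCR
  uint64_t StatusWord;  // FPSR
};

bool fenv_eq(const FPState &a, const FPState &b) {
  return a.ControlWord == b.ControlWord && a.StatusWord == b.StatusWord;
}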
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index 9f3e983..13a0aae 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -36,6 +36,7 @@ add_fp_unittest(
libc.src.__support.math.cospif
libc.src.__support.math.cospif16
libc.src.__support.math.dsqrtl
+ libc.src.__support.math.exp10m1f
libc.src.__support.math.erff
libc.src.__support.math.exp
libc.src.__support.math.exp10
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 655e7fb..25bf5ad 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -57,6 +57,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat) {
EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::cosf(0.0f));
EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::coshf(0.0f));
EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::cospif(0.0f));
+ EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::exp10m1f(0.0f));
EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::erff(0.0f));
EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::exp10f(0.0f));
EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::expf(0.0f));
diff --git a/libclc/Maintainers.md b/libclc/Maintainers.md
index ac869b6..cdd84e0 100644
--- a/libclc/Maintainers.md
+++ b/libclc/Maintainers.md
@@ -10,8 +10,17 @@ The following people are the active maintainers for the project. Please reach
out to them for code reviews, questions about their area of expertise, or other
assistance.
-Fraser Cormack \
-fraser@codeplay.com (email), [frasercrmck](https://github.com/frasercrmck) (GitHub)
+Wenju He \
+wenju.he@intel.com (email), [wenju-he](https://github.com/wenju-he) (GitHub)
Tom Stellard \
tstellar@redhat.com (email), [tstellar](https://github.com/tstellar) (GitHub)
+
+## Inactive Maintainers
+
+The following people have graciously spent time performing maintainership
+responsibilities but are no longer active in that role. Thank you for all your
+help with the success of the project!
+
+Fraser Cormack \
+frasercrmck@pm.me (email), [frasercrmck](https://github.com/frasercrmck) (GitHub)
diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst
index b5dcfc7..87d86c134 100644
--- a/libcxx/docs/ReleaseNotes/22.rst
+++ b/libcxx/docs/ReleaseNotes/22.rst
@@ -64,6 +64,8 @@ Improvements and New Features
- Multiple internal types have been refactored to use ``[[no_unique_address]]``, resulting in faster compile times and
reduced debug information.
+- The performance of ``std::find`` has been improved by up to 2x for integral types.
+
Deprecations and Removals
-------------------------
diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst
index 2277910..6171629 100644
--- a/libcxx/docs/TestingLibcxx.rst
+++ b/libcxx/docs/TestingLibcxx.rst
@@ -482,7 +482,7 @@ when running the benchmarks. For example,
.. code-block:: bash
- $ libcxx/utils/libcxx-lit <build> libcxx/test/benchmarks/string.bench.cpp --show-all --param optimization=speed
+ $ libcxx/utils/libcxx-lit <build> libcxx/test/benchmarks/containers/string.bench.cpp --show-all --param optimization=speed
Note that benchmarks are only dry-run when run via the ``check-cxx`` target since
we only want to make sure they don't rot. Do not rely on the results of benchmarks
@@ -504,7 +504,7 @@ more benchmarks, as usual:
.. code-block:: bash
$ cmake -S runtimes -B <build> [...]
- $ libcxx/utils/libcxx-lit <build> libcxx/test/benchmarks/string.bench.cpp --param optimization=speed
+ $ libcxx/utils/libcxx-lit <build> libcxx/test/benchmarks/containers/string.bench.cpp --param optimization=speed
Then, get the consolidated benchmark output for that run using ``consolidate-benchmarks``:
diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 8c8cb58..5f32ae8 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -12,6 +12,7 @@
#include <__algorithm/find_segment_if.h>
#include <__algorithm/min.h>
+#include <__algorithm/simd_utils.h>
#include <__algorithm/unwrap_iter.h>
#include <__bit/countr.h>
#include <__bit/invert_if.h>
@@ -44,39 +45,102 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// generic implementation
template <class _Iter, class _Sent, class _Tp, class _Proj>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
-__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
for (; __first != __last; ++__first)
if (std::__invoke(__proj, *__first) == __value)
break;
return __first;
}
-// trivially equality comparable implementations
-template <class _Tp,
- class _Up,
- class _Proj,
- __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
- sizeof(_Tp) == 1,
- int> = 0>
-_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
- if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
- return __ret;
- return __last;
+template <class _Iter, class _Sent, class _Tp, class _Proj>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter
+__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) {
+ return std::__find_loop(std::move(__first), std::move(__last), __value, __proj);
}
-#if _LIBCPP_HAS_WIDE_CHARACTERS
-template <class _Tp,
- class _Up,
- class _Proj,
- __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value &&
- sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t),
- int> = 0>
+#if _LIBCPP_VECTORIZE_ALGORITHMS
+template <class _Tp, class _Up>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, _Up __value) {
+ if (!__libcpp_is_constant_evaluated()) {
+ constexpr size_t __unroll_count = 4;
+ constexpr size_t __vec_size = __native_vector_size<_Tp>;
+ using __vec = __simd_vector<_Tp, __vec_size>;
+
+ auto __orig_first = __first;
+
+ auto __values = static_cast<__simd_vector<_Up, __vec_size>>(__value); // broadcast the value
+ while (static_cast<size_t>(__last - __first) >= __unroll_count * __vec_size) [[__unlikely__]] {
+ __vec __lhs[__unroll_count];
+
+ for (size_t __i = 0; __i != __unroll_count; ++__i)
+ __lhs[__i] = std::__load_vector<__vec>(__first + __i * __vec_size);
+
+ for (size_t __i = 0; __i != __unroll_count; ++__i) {
+ if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) {
+ auto __offset = __i * __vec_size + std::__find_first_set(__cmp_res);
+ return __first + __offset;
+ }
+ }
+
+ __first += __unroll_count * __vec_size;
+ }
+
+ // check the remaining 0-3 vectors
+ while (static_cast<size_t>(__last - __first) >= __vec_size) {
+ if (auto __cmp_res = std::__load_vector<__vec>(__first) == __values; std::__any_of(__cmp_res)) {
+ return __first + std::__find_first_set(__cmp_res);
+ }
+ __first += __vec_size;
+ }
+
+ if (__last - __first == 0)
+ return __first;
+
+ // Check if we can load elements in front of the current pointer. If that's
+ // the case, load a vector at (last - vector_size) to check the remaining elements
+ if (static_cast<size_t>(__first - __orig_first) >= __vec_size) {
+ __first = __last - __vec_size;
+ return __first + std::__find_first_set(std::__load_vector<__vec>(__first) == __values);
+ }
+ }
+
+ __identity __proj;
+ return std::__find_loop(__first, __last, __value, __proj);
+}
+#endif
+
+#ifndef _LIBCPP_CXX03_LANG
+// trivially equality comparable implementations
+template <
+ class _Tp,
+ class _Up,
+ class _Proj,
+ __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) {
- if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
- return __ret;
- return __last;
+ if constexpr (sizeof(_Tp) == 1) {
+ if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first))
+ return __ret;
+ return __last;
+ }
+# if _LIBCPP_HAS_WIDE_CHARACTERS
+ else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) {
+ if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first))
+ return __ret;
+ return __last;
+ }
+# endif
+# if _LIBCPP_VECTORIZE_ALGORITHMS
+ else if constexpr (is_integral<_Tp>::value) {
+ return std::__find_vectorized(__first, __last, __value);
+ }
+# endif
+ else {
+ __identity __proj;
+ return std::__find_loop(__first, __last, __value, __proj);
+ }
}
-#endif // _LIBCPP_HAS_WIDE_CHARACTERS
+#endif
// TODO: This should also be possible to get right with different signedness
// cast integral types to allow vectorization
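The vectorized path above is easiest to read as three phases: a 4x-unrolled main loop, a whole-vector cleanup loop, and an overlapping reload of the final partial block. A scalar sketch of the same control flow, with an illustrative block width standing in for __native_vector_size; the real code replaces the inner element loops with SIMD compares:

#include <cstddef>

// Scalar sketch of the unrolled block search: scan 4 "vectors" of kLanes
// elements per iteration, then the remaining whole blocks, then a plain
// tail loop (the real code instead reloads a full vector at last - kLanes,
// overlapping the already-scanned region).
template <typename T>
const T *find_blocked(const T *first, const T *last, T value) {
  constexpr std::size_t kLanes = 16; // stand-in for __native_vector_size
  constexpr std::size_t kUnroll = 4;
  while (static_cast<std::size_t>(last - first) >= kUnroll * kLanes) {
    for (std::size_t i = 0; i != kUnroll * kLanes; ++i)
      if (first[i] == value)
        return first + i;
    first += kUnroll * kLanes;
  }
  while (static_cast<std::size_t>(last - first) >= kLanes) {
    for (std::size_t i = 0; i != kLanes; ++i)
      if (first[i] == value)
        return first + i;
    first += kLanes;
  }
  for (; first != last; ++first)
    if (*first == value)
      return first;
  return last;
}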
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 96b074c..aaeb8a8 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -115,6 +115,11 @@ template <class _VecT, class _Iter>
}
template <class _Tp, size_t _Np>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __any_of(__simd_vector<_Tp, _Np> __vec) noexcept {
+ return __builtin_reduce_or(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
+}
+
+template <class _Tp, size_t _Np>
[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
}
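__any_of lowers to a lane-wise conversion to bool followed by an OR reduction via __builtin_reduce_or. A portable scalar equivalent of the reduction, for readers without the clang vector builtins:

#include <cstddef>

// Portable stand-in for __any_of: true if any lane of the comparison mask
// is nonzero.
template <typename T, std::size_t N>
bool any_of_lanes(const T (&mask)[N]) {
  bool any = false;
  for (std::size_t i = 0; i != N; ++i)
    any |= (mask[i] != 0);
  return any;
}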
diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h
index 31ba9bc..7bb235b 100644
--- a/libcxx/include/__flat_map/flat_map.h
+++ b/libcxx/include/__flat_map/flat_map.h
@@ -29,7 +29,6 @@
#include <__flat_map/key_value_iterator.h>
#include <__flat_map/sorted_unique.h>
#include <__flat_map/utils.h>
-#include <__functional/invoke.h>
#include <__functional/is_transparent.h>
#include <__functional/operations.h>
#include <__fwd/memory.h>
@@ -48,7 +47,6 @@
#include <__ranges/container_compatible_range.h>
#include <__ranges/drop_view.h>
#include <__ranges/from_range.h>
-#include <__ranges/ref_view.h>
#include <__ranges/size.h>
#include <__ranges/subrange.h>
#include <__ranges/zip_view.h>
diff --git a/libcxx/include/__flat_map/flat_multimap.h b/libcxx/include/__flat_map/flat_multimap.h
index abaacf9..96d9454 100644
--- a/libcxx/include/__flat_map/flat_multimap.h
+++ b/libcxx/include/__flat_map/flat_multimap.h
@@ -22,7 +22,6 @@
#include <__algorithm/upper_bound.h>
#include <__assert>
#include <__compare/synth_three_way.h>
-#include <__concepts/convertible_to.h>
#include <__concepts/swappable.h>
#include <__config>
#include <__cstddef/byte.h>
@@ -30,7 +29,6 @@
#include <__flat_map/key_value_iterator.h>
#include <__flat_map/sorted_equivalent.h>
#include <__flat_map/utils.h>
-#include <__functional/invoke.h>
#include <__functional/is_transparent.h>
#include <__functional/operations.h>
#include <__fwd/vector.h>
@@ -47,7 +45,6 @@
#include <__ranges/container_compatible_range.h>
#include <__ranges/drop_view.h>
#include <__ranges/from_range.h>
-#include <__ranges/ref_view.h>
#include <__ranges/size.h>
#include <__ranges/subrange.h>
#include <__ranges/zip_view.h>
@@ -57,14 +54,12 @@
#include <__type_traits/is_allocator.h>
#include <__type_traits/is_nothrow_constructible.h>
#include <__type_traits/is_same.h>
-#include <__type_traits/maybe_const.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#include <__utility/scope_guard.h>
#include <__vector/vector.h>
#include <initializer_list>
-#include <stdexcept>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
diff --git a/libcxx/include/__flat_map/key_value_iterator.h b/libcxx/include/__flat_map/key_value_iterator.h
index d04a23d..795651a 100644
--- a/libcxx/include/__flat_map/key_value_iterator.h
+++ b/libcxx/include/__flat_map/key_value_iterator.h
@@ -20,7 +20,6 @@
#include <__type_traits/conditional.h>
#include <__utility/forward.h>
#include <__utility/move.h>
-#include <__utility/pair.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
diff --git a/libcxx/include/__flat_set/flat_multiset.h b/libcxx/include/__flat_set/flat_multiset.h
index 65f4161..b1a4917 100644
--- a/libcxx/include/__flat_set/flat_multiset.h
+++ b/libcxx/include/__flat_set/flat_multiset.h
@@ -13,54 +13,40 @@
#include <__algorithm/equal_range.h>
#include <__algorithm/lexicographical_compare_three_way.h>
#include <__algorithm/lower_bound.h>
-#include <__algorithm/min.h>
#include <__algorithm/ranges_equal.h>
#include <__algorithm/ranges_inplace_merge.h>
#include <__algorithm/ranges_is_sorted.h>
#include <__algorithm/ranges_sort.h>
-#include <__algorithm/ranges_unique.h>
#include <__algorithm/remove_if.h>
#include <__algorithm/upper_bound.h>
#include <__assert>
#include <__compare/synth_three_way.h>
-#include <__concepts/convertible_to.h>
#include <__concepts/swappable.h>
#include <__config>
-#include <__cstddef/byte.h>
-#include <__cstddef/ptrdiff_t.h>
-#include <__flat_map/key_value_iterator.h>
#include <__flat_map/sorted_equivalent.h>
#include <__flat_set/ra_iterator.h>
#include <__flat_set/utils.h>
-#include <__functional/invoke.h>
#include <__functional/is_transparent.h>
#include <__functional/operations.h>
#include <__fwd/vector.h>
#include <__iterator/concepts.h>
-#include <__iterator/distance.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/prev.h>
-#include <__iterator/ranges_iterator_traits.h>
#include <__iterator/reverse_iterator.h>
#include <__memory/allocator_traits.h>
#include <__memory/uses_allocator.h>
#include <__memory/uses_allocator_construction.h>
-#include <__ranges/access.h>
#include <__ranges/concepts.h>
#include <__ranges/container_compatible_range.h>
#include <__ranges/drop_view.h>
#include <__ranges/from_range.h>
-#include <__ranges/ref_view.h>
#include <__ranges/size.h>
#include <__ranges/subrange.h>
-#include <__ranges/zip_view.h>
-#include <__type_traits/conjunction.h>
#include <__type_traits/container_traits.h>
#include <__type_traits/invoke.h>
#include <__type_traits/is_allocator.h>
#include <__type_traits/is_nothrow_constructible.h>
#include <__type_traits/is_same.h>
-#include <__type_traits/maybe_const.h>
#include <__utility/as_const.h>
#include <__utility/exception_guard.h>
#include <__utility/move.h>
diff --git a/libcxx/include/__flat_set/flat_set.h b/libcxx/include/__flat_set/flat_set.h
index cc788bd..5fa1f2d 100644
--- a/libcxx/include/__flat_set/flat_set.h
+++ b/libcxx/include/__flat_set/flat_set.h
@@ -12,7 +12,6 @@
#include <__algorithm/lexicographical_compare_three_way.h>
#include <__algorithm/lower_bound.h>
-#include <__algorithm/min.h>
#include <__algorithm/ranges_adjacent_find.h>
#include <__algorithm/ranges_equal.h>
#include <__algorithm/ranges_inplace_merge.h>
@@ -24,20 +23,16 @@
#include <__compare/synth_three_way.h>
#include <__concepts/swappable.h>
#include <__config>
-#include <__cstddef/ptrdiff_t.h>
#include <__flat_map/sorted_unique.h>
#include <__flat_set/ra_iterator.h>
#include <__flat_set/utils.h>
-#include <__functional/invoke.h>
#include <__functional/is_transparent.h>
#include <__functional/operations.h>
#include <__fwd/vector.h>
#include <__iterator/concepts.h>
-#include <__iterator/distance.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/next.h>
#include <__iterator/prev.h>
-#include <__iterator/ranges_iterator_traits.h>
#include <__iterator/reverse_iterator.h>
#include <__memory/allocator_traits.h>
#include <__memory/uses_allocator.h>
@@ -47,10 +42,7 @@
#include <__ranges/container_compatible_range.h>
#include <__ranges/drop_view.h>
#include <__ranges/from_range.h>
-#include <__ranges/ref_view.h>
#include <__ranges/size.h>
-#include <__ranges/subrange.h>
-#include <__type_traits/conjunction.h>
#include <__type_traits/container_traits.h>
#include <__type_traits/invoke.h>
#include <__type_traits/is_allocator.h>
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index dc19333..93d43f8 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -1225,6 +1225,7 @@ module std [system] {
header "deque"
export *
export std.iterator.reverse_iterator
+ export std.algorithm.simd_utils // This is a workaround for https://llvm.org/PR120108.
}
module exception {
@@ -1847,7 +1848,10 @@ module std [system] {
module ranges {
module access { header "__ranges/access.h" }
- module all { header "__ranges/all.h" }
+ module all {
+ header "__ranges/all.h"
+ export std.ranges.ref_view
+ }
module as_rvalue_view { header "__ranges/as_rvalue_view.h" }
module chunk_by_view {
header "__ranges/chunk_by_view.h"
@@ -2235,6 +2239,7 @@ module std [system] {
header "vector"
export std.iterator.reverse_iterator
export *
+ export std.algorithm.simd_utils // This is a workaround for https://llvm.org/PR120108.
}
// Experimental C++ Standard Library interfaces
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
index b2ead1c..afea31f 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
@@ -51,6 +51,7 @@ int main(int argc, char** argv) {
// find
bm.template operator()<std::vector<char>>("std::find(vector<char>) (" + comment + ")", std_find);
bm.template operator()<std::vector<int>>("std::find(vector<int>) (" + comment + ")", std_find);
+ bm.template operator()<std::vector<long long>>("std::find(vector<long long>) (" + comment + ")", std_find);
bm.template operator()<std::deque<int>>("std::find(deque<int>) (" + comment + ")", std_find);
bm.template operator()<std::list<int>>("std::find(list<int>) (" + comment + ")", std_find);
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 98267d1e..1270f27 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -1358,20 +1358,21 @@ SyntheticSection *EhInputSection::getParent() const {
// .eh_frame is a sequence of CIE or FDE records.
// This function splits an input section into records and returns them.
template <class ELFT> void EhInputSection::split() {
- const RelsOrRelas<ELFT> rels = relsOrRelas<ELFT>(/*supportsCrel=*/false);
- // getReloc expects the relocations to be sorted by r_offset. See the comment
- // in scanRelocs.
- if (rels.areRelocsRel()) {
- SmallVector<typename ELFT::Rel, 0> storage;
- split<ELFT>(sortRels(rels.rels, storage));
- } else {
- SmallVector<typename ELFT::Rela, 0> storage;
- split<ELFT>(sortRels(rels.relas, storage));
- }
-}
+ const RelsOrRelas<ELFT> elfRels = relsOrRelas<ELFT>();
+ if (elfRels.areRelocsCrel())
+ preprocessRelocs<ELFT>(elfRels.crels);
+ else if (elfRels.areRelocsRel())
+ preprocessRelocs<ELFT>(elfRels.rels);
+ else
+ preprocessRelocs<ELFT>(elfRels.relas);
+
+ // The loop below expects the relocations to be sorted by offset.
+ auto cmp = [](const Relocation &a, const Relocation &b) {
+ return a.offset < b.offset;
+ };
+ if (!llvm::is_sorted(rels, cmp))
+ llvm::stable_sort(rels, cmp);
-template <class ELFT, class RelTy>
-void EhInputSection::split(ArrayRef<RelTy> rels) {
ArrayRef<uint8_t> d = content();
const char *msg = nullptr;
unsigned relI = 0;
@@ -1397,10 +1398,10 @@ void EhInputSection::split(ArrayRef<RelTy> rels) {
// Find the first relocation that points to [off,off+size). Relocations
// have been sorted by r_offset.
const uint64_t off = d.data() - content().data();
- while (relI != rels.size() && rels[relI].r_offset < off)
+ while (relI != rels.size() && rels[relI].offset < off)
++relI;
unsigned firstRel = -1;
- if (relI != rels.size() && rels[relI].r_offset < off + size)
+ if (relI != rels.size() && rels[relI].offset < off + size)
firstRel = relI;
(id == 0 ? cies : fdes).emplace_back(off, this, size, firstRel);
d = d.slice(size);
@@ -1410,6 +1411,23 @@ void EhInputSection::split(ArrayRef<RelTy> rels) {
<< getObjMsg(d.data() - content().data());
}
+template <class ELFT, class RelTy>
+void EhInputSection::preprocessRelocs(Relocs<RelTy> elfRels) {
+ Ctx &ctx = file->ctx;
+ rels.reserve(elfRels.size());
+ for (auto rel : elfRels) {
+ uint64_t offset = rel.r_offset;
+ Symbol &sym = file->getSymbol(rel.getSymbol(ctx.arg.isMips64EL));
+ RelType type = rel.getType(ctx.arg.isMips64EL);
+ RelExpr expr = ctx.target->getRelExpr(type, sym, content().data() + offset);
+ int64_t addend =
+ RelTy::HasAddend
+ ? getAddend<ELFT>(rel)
+ : ctx.target->getImplicitAddend(content().data() + offset, type);
+ rels.push_back({expr, type, offset, addend, &sym});
+ }
+}
+
// Return the offset in an output section for a given input offset.
uint64_t EhInputSection::getParentOffset(uint64_t offset) const {
auto it = partition_point(
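The new preprocessRelocs normalizes REL, RELA, and CREL records into lld's uniform Relocation struct once, and split() then sorts by offset so every later consumer indexes a single array. A condensed sketch of that normalize-then-sort shape, assuming a hypothetical RelTy that exposes getType/getAddend accessors (the real REL form derives its addend from the section contents instead):

#include <algorithm>
#include <cstdint>
#include <vector>

struct Relocation { // simplified stand-in for lld's Relocation
  uint32_t type;
  uint64_t offset;
  int64_t addend;
};

// Convert any concrete ELF relocation record type into the uniform form,
// then sort by offset so lookups can scan or binary-search one array.
template <typename RelTy>
std::vector<Relocation> preprocess(const std::vector<RelTy> &elfRels) {
  std::vector<Relocation> rels;
  rels.reserve(elfRels.size());
  for (const RelTy &r : elfRels)
    rels.push_back({r.getType(), r.r_offset, r.getAddend()});
  auto cmp = [](const Relocation &a, const Relocation &b) {
    return a.offset < b.offset;
  };
  if (!std::is_sorted(rels.begin(), rels.end(), cmp))
    std::stable_sort(rels.begin(), rels.end(), cmp);
  return rels;
}

Paying the conversion cost once up front is what lets the REL/RELA/CREL special-casing disappear from MarkLive and SyntheticSections below.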
diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h
index 8462f03..dc29fed 100644
--- a/lld/ELF/InputSection.h
+++ b/lld/ELF/InputSection.h
@@ -394,7 +394,7 @@ public:
StringRef name);
static bool classof(const SectionBase *s) { return s->kind() == EHFrame; }
template <class ELFT> void split();
- template <class ELFT, class RelTy> void split(ArrayRef<RelTy> rels);
+ template <class ELFT, class RelTy> void preprocessRelocs(Relocs<RelTy> rels);
// Splittable sections are handled as a sequence of data
// rather than a single large blob of data.
@@ -402,6 +402,10 @@ public:
SyntheticSection *getParent() const;
uint64_t getParentOffset(uint64_t offset) const;
+
+ // Preprocessed relocations in uniform format to avoid REL/RELA/CREL
+ // relocation format handling throughout the codebase.
+ SmallVector<Relocation, 0> rels;
};
// This is a section that is added directly to an output section
diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp
index 83ae9fb..a7b0f08 100644
--- a/lld/ELF/MarkLive.cpp
+++ b/lld/ELF/MarkLive.cpp
@@ -68,10 +68,9 @@ private:
void mark();
template <class RelTy>
- void resolveReloc(InputSectionBase &sec, RelTy &rel, bool fromFDE);
+ void resolveReloc(InputSectionBase &sec, const RelTy &rel, bool fromFDE);
- template <class RelTy>
- void scanEhFrameSection(EhInputSection &eh, ArrayRef<RelTy> rels);
+ void scanEhFrameSection(EhInputSection &eh);
Ctx &ctx;
// The index of the partition that we are currently processing.
@@ -115,23 +114,38 @@ static uint64_t getAddend(Ctx &, InputSectionBase &sec,
template <class ELFT, bool TrackWhyLive>
template <class RelTy>
void MarkLive<ELFT, TrackWhyLive>::resolveReloc(InputSectionBase &sec,
- RelTy &rel, bool fromFDE) {
+ const RelTy &rel,
+ bool fromFDE) {
// If a symbol is referenced in a live section, it is used.
- Symbol &sym = sec.file->getRelocTargetSym(rel);
- sym.used = true;
+ Symbol *sym;
+ if constexpr (std::is_same_v<RelTy, Relocation>) {
+ assert(isa<EhInputSection>(sec));
+ sym = rel.sym;
+ } else {
+ sym = &sec.file->getRelocTargetSym(rel);
+ }
+ sym->used = true;
LiveReason reason;
- if (TrackWhyLive)
- reason = {SecOffset(&sec, rel.r_offset), "referenced by"};
+ if (TrackWhyLive) {
+ if constexpr (std::is_same_v<RelTy, Relocation>)
+ reason = {SecOffset(&sec, rel.offset), "referenced by"};
+ else
+ reason = {SecOffset(&sec, rel.r_offset), "referenced by"};
+ }
- if (auto *d = dyn_cast<Defined>(&sym)) {
+ if (auto *d = dyn_cast<Defined>(sym)) {
auto *relSec = dyn_cast_or_null<InputSectionBase>(d->section);
if (!relSec)
return;
uint64_t offset = d->value;
- if (d->isSection())
- offset += getAddend<ELFT>(ctx, sec, rel);
+ if (d->isSection()) {
+ if constexpr (std::is_same_v<RelTy, Relocation>)
+ offset += rel.addend;
+ else
+ offset += getAddend<ELFT>(ctx, sec, rel);
+ }
// fromFDE being true means this is referenced by a FDE in a .eh_frame
// piece. The relocation points to the described function or to a LSDA. We
@@ -141,8 +155,9 @@ void MarkLive<ELFT, TrackWhyLive>::resolveReloc(InputSectionBase &sec,
// associated text section is live, the LSDA will be retained due to section
// group/SHF_LINK_ORDER rules (b) if the associated text section should be
// discarded, marking the LSDA will unnecessarily retain the text section.
- if (!(fromFDE && ((relSec->flags & (SHF_EXECINSTR | SHF_LINK_ORDER)) ||
- relSec->nextInSectionGroup))) {
+ if (!(fromFDE && std::is_same_v<RelTy, Relocation> &&
+ ((relSec->flags & (SHF_EXECINSTR | SHF_LINK_ORDER)) ||
+ relSec->nextInSectionGroup))) {
Symbol *canonicalSym = d;
if (TrackWhyLive && d->isSection()) {
// This is expensive, so ideally this would be deferred until it's known
@@ -159,15 +174,15 @@ void MarkLive<ELFT, TrackWhyLive>::resolveReloc(InputSectionBase &sec,
return;
}
- if (auto *ss = dyn_cast<SharedSymbol>(&sym)) {
+ if (auto *ss = dyn_cast<SharedSymbol>(sym)) {
if (!ss->isWeak()) {
cast<SharedFile>(ss->file)->isNeeded = true;
if (TrackWhyLive)
- whyLive.try_emplace(&sym, reason);
+ whyLive.try_emplace(sym, reason);
}
}
- for (InputSectionBase *sec : cNamedSections.lookup(sym.getName()))
+ for (InputSectionBase *sec : cNamedSections.lookup(sym->getName()))
enqueue(sec, /*offset=*/0, /*sym=*/nullptr, reason);
}
@@ -186,9 +201,8 @@ void MarkLive<ELFT, TrackWhyLive>::resolveReloc(InputSectionBase &sec,
// the gc pass. With that we would be able to also gc some sections holding
// LSDAs and personality functions if we found that they were unused.
template <class ELFT, bool TrackWhyLive>
-template <class RelTy>
-void MarkLive<ELFT, TrackWhyLive>::scanEhFrameSection(EhInputSection &eh,
- ArrayRef<RelTy> rels) {
+void MarkLive<ELFT, TrackWhyLive>::scanEhFrameSection(EhInputSection &eh) {
+ ArrayRef<Relocation> rels = eh.rels;
for (const EhSectionPiece &cie : eh.cies)
if (cie.firstRelocation != unsigned(-1))
resolveReloc(eh, rels[cie.firstRelocation], false);
@@ -198,7 +212,7 @@ void MarkLive<ELFT, TrackWhyLive>::scanEhFrameSection(EhInputSection &eh,
continue;
uint64_t pieceEnd = fde.inputOff + fde.size;
for (size_t j = firstRelI, end2 = rels.size();
- j < end2 && rels[j].r_offset < pieceEnd; ++j)
+ j < end2 && rels[j].offset < pieceEnd; ++j)
resolveReloc(eh, rels[j], true);
}
}
@@ -360,14 +374,8 @@ void MarkLive<ELFT, TrackWhyLive>::run() {
// that point to .eh_frames. Otherwise, the garbage collector would drop
// all of them. We also want to preserve personality routines and LSDA
// referenced by .eh_frame sections, so we scan them for that here.
- for (EhInputSection *eh : ctx.ehInputSections) {
- const RelsOrRelas<ELFT> rels =
- eh->template relsOrRelas<ELFT>(/*supportsCrel=*/false);
- if (rels.areRelocsRel())
- scanEhFrameSection(*eh, rels.rels);
- else if (rels.relas.size())
- scanEhFrameSection(*eh, rels.relas);
- }
+ for (EhInputSection *eh : ctx.ehInputSections)
+ scanEhFrameSection(*eh);
for (InputSectionBase *sec : ctx.inputSections) {
if (sec->flags & SHF_GNU_RETAIN) {
enqueue(sec, /*offset=*/0, /*sym=*/nullptr, {std::nullopt, "retained"});
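resolveReloc above now serves both raw ELF records and the preprocessed Relocation struct; if constexpr picks the right field names per instantiation with no runtime branch. A minimal sketch of the dispatch pattern (types and fields illustrative):

#include <cstdint>
#include <type_traits>

struct Relocation { uint64_t offset; int64_t addend; }; // preprocessed form
struct ElfRel { uint64_t r_offset; };                   // raw REL record

// One template body serves both representations; the branch is resolved at
// compile time, so each instantiation only compiles the accessors it uses.
template <typename RelTy> uint64_t reloc_offset(const RelTy &rel) {
  if constexpr (std::is_same_v<RelTy, Relocation>)
    return rel.offset;
  else
    return rel.r_offset;
}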
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 457a794..bbf4b29 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -403,12 +403,12 @@ EhFrameSection::EhFrameSection(Ctx &ctx)
// Search for an existing CIE record or create a new one.
// CIE records from input object files are uniquified by their contents
// and where their relocations point to.
-template <class RelTy>
-CieRecord *EhFrameSection::addCie(EhSectionPiece &cie, ArrayRef<RelTy> rels) {
+CieRecord *EhFrameSection::addCie(EhSectionPiece &cie,
+ ArrayRef<Relocation> rels) {
Symbol *personality = nullptr;
unsigned firstRelI = cie.firstRelocation;
if (firstRelI != (unsigned)-1)
- personality = &cie.sec->file->getRelocTargetSym(rels[firstRelI]);
+ personality = rels[firstRelI].sym;
// Search for an existing CIE by CIE contents/relocation target pair.
CieRecord *&rec = cieMap[{cie.data(), personality}];
@@ -424,25 +424,20 @@ CieRecord *EhFrameSection::addCie(EhSectionPiece &cie, ArrayRef<RelTy> rels) {
// There is one FDE per function. Returns a non-null pointer to the function
// symbol if the given FDE points to a live function.
-template <class RelTy>
-Defined *EhFrameSection::isFdeLive(EhSectionPiece &fde, ArrayRef<RelTy> rels) {
- auto *sec = cast<EhInputSection>(fde.sec);
- unsigned firstRelI = fde.firstRelocation;
-
+Defined *EhFrameSection::isFdeLive(EhSectionPiece &fde,
+ ArrayRef<Relocation> rels) {
// An FDE should point to some function because FDEs are to describe
// functions. That's however not always the case due to an issue of
// ld.gold with -r. ld.gold may discard only functions and leave their
// corresponding FDEs, which results in creating bad .eh_frame sections.
// To deal with that, we ignore such FDEs.
+ unsigned firstRelI = fde.firstRelocation;
if (firstRelI == (unsigned)-1)
return nullptr;
- const RelTy &rel = rels[firstRelI];
- Symbol &b = sec->file->getRelocTargetSym(rel);
-
// FDEs for garbage-collected or merged-by-ICF sections, or sections in
// another partition, are dead.
- if (auto *d = dyn_cast<Defined>(&b))
+ if (auto *d = dyn_cast<Defined>(rels[firstRelI].sym))
if (!d->folded && d->section && d->section->partition == partition)
return d;
return nullptr;
@@ -452,13 +447,13 @@ Defined *EhFrameSection::isFdeLive(EhSectionPiece &fde, ArrayRef<RelTy> rels) {
// is one CIE record per input object file which is followed by
// a list of FDEs. This function searches an existing CIE or create a new
// one and associates FDEs to the CIE.
-template <class ELFT, class RelTy>
-void EhFrameSection::addRecords(EhInputSection *sec, ArrayRef<RelTy> rels) {
+template <endianness e> void EhFrameSection::addRecords(EhInputSection *sec) {
+ auto rels = sec->rels;
offsetToCie.clear();
for (EhSectionPiece &cie : sec->cies)
- offsetToCie[cie.inputOff] = addCie<RelTy>(cie, rels);
+ offsetToCie[cie.inputOff] = addCie(cie, rels);
for (EhSectionPiece &fde : sec->fdes) {
- uint32_t id = endian::read32<ELFT::Endianness>(fde.data().data() + 4);
+ uint32_t id = endian::read32<e>(fde.data().data() + 4);
CieRecord *rec = offsetToCie[fde.inputOff + 4 - id];
if (!rec)
Fatal(ctx) << sec << ": invalid CIE reference";
@@ -470,23 +465,11 @@ void EhFrameSection::addRecords(EhInputSection *sec, ArrayRef<RelTy> rels) {
}
}
-template <class ELFT>
-void EhFrameSection::addSectionAux(EhInputSection *sec) {
- if (!sec->isLive())
- return;
- const RelsOrRelas<ELFT> rels =
- sec->template relsOrRelas<ELFT>(/*supportsCrel=*/false);
- if (rels.areRelocsRel())
- addRecords<ELFT>(sec, rels.rels);
- else
- addRecords<ELFT>(sec, rels.relas);
-}
-
// Used by ICF<ELFT>::handleLSDA(). This function is very similar to
// EhFrameSection::addRecords().
-template <class ELFT, class RelTy>
+template <class ELFT>
void EhFrameSection::iterateFDEWithLSDAAux(
- EhInputSection &sec, ArrayRef<RelTy> rels, DenseSet<size_t> &ciesWithLSDA,
+ EhInputSection &sec, DenseSet<size_t> &ciesWithLSDA,
llvm::function_ref<void(InputSection &)> fn) {
for (EhSectionPiece &cie : sec.cies)
if (hasLSDA(cie))
@@ -497,7 +480,7 @@ void EhFrameSection::iterateFDEWithLSDAAux(
continue;
// The CIE has a LSDA argument. Call fn with d's section.
- if (Defined *d = isFdeLive(fde, rels))
+ if (Defined *d = isFdeLive(fde, sec.rels))
if (auto *s = dyn_cast_or_null<InputSection>(d->section))
fn(*s);
}
@@ -509,12 +492,7 @@ void EhFrameSection::iterateFDEWithLSDA(
DenseSet<size_t> ciesWithLSDA;
for (EhInputSection *sec : sections) {
ciesWithLSDA.clear();
- const RelsOrRelas<ELFT> rels =
- sec->template relsOrRelas<ELFT>(/*supportsCrel=*/false);
- if (rels.areRelocsRel())
- iterateFDEWithLSDAAux<ELFT>(*sec, rels.rels, ciesWithLSDA, fn);
- else
- iterateFDEWithLSDAAux<ELFT>(*sec, rels.relas, ciesWithLSDA, fn);
+ iterateFDEWithLSDAAux<ELFT>(*sec, ciesWithLSDA, fn);
}
}
@@ -531,20 +509,16 @@ void EhFrameSection::finalizeContents() {
case ELFNoneKind:
llvm_unreachable("invalid ekind");
case ELF32LEKind:
- for (EhInputSection *sec : sections)
- addSectionAux<ELF32LE>(sec);
- break;
- case ELF32BEKind:
- for (EhInputSection *sec : sections)
- addSectionAux<ELF32BE>(sec);
- break;
case ELF64LEKind:
for (EhInputSection *sec : sections)
- addSectionAux<ELF64LE>(sec);
+ if (sec->isLive())
+ addRecords<endianness::little>(sec);
break;
+ case ELF32BEKind:
case ELF64BEKind:
for (EhInputSection *sec : sections)
- addSectionAux<ELF64BE>(sec);
+ if (sec->isLive())
+ addRecords<endianness::big>(sec);
break;
}
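addCie keys its uniquification map on the pair of CIE contents and personality symbol, so byte-identical CIEs from different object files collapse into one output record. A toy version of that keyed-map pattern using std::map (lld itself keys a DenseMap on the raw section bytes):

#include <map>
#include <string>
#include <utility>

struct CieRecord { int outputId; };

// Identical CIE bytes with the same personality symbol collapse to one
// record; a new record is created only on first insertion.
class CieUniquer {
  std::map<std::pair<std::string, const void *>, CieRecord> cieMap;
  int nextId = 0;

public:
  CieRecord &add(const std::string &contents, const void *personality) {
    auto [it, inserted] =
        cieMap.try_emplace({contents, personality}, CieRecord{});
    if (inserted)
      it->second.outputId = nextId++;
    return it->second;
  }
};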
diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h
index 55a1071..ac3ec63 100644
--- a/lld/ELF/SyntheticSections.h
+++ b/lld/ELF/SyntheticSections.h
@@ -80,19 +80,14 @@ private:
uint64_t size = 0;
- template <class ELFT, class RelTy>
- void addRecords(EhInputSection *s, llvm::ArrayRef<RelTy> rels);
- template <class ELFT> void addSectionAux(EhInputSection *s);
- template <class ELFT, class RelTy>
- void iterateFDEWithLSDAAux(EhInputSection &sec, ArrayRef<RelTy> rels,
+ template <llvm::endianness E> void addRecords(EhInputSection *s);
+ template <class ELFT>
+ void iterateFDEWithLSDAAux(EhInputSection &sec,
llvm::DenseSet<size_t> &ciesWithLSDA,
llvm::function_ref<void(InputSection &)> fn);
- template <class RelTy>
- CieRecord *addCie(EhSectionPiece &piece, ArrayRef<RelTy> rels);
-
- template <class RelTy>
- Defined *isFdeLive(EhSectionPiece &piece, ArrayRef<RelTy> rels);
+ CieRecord *addCie(EhSectionPiece &piece, ArrayRef<Relocation> rels);
+ Defined *isFdeLive(EhSectionPiece &piece, ArrayRef<Relocation> rels);
uint64_t getFdePc(uint8_t *buf, size_t off, uint8_t enc) const;
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 6ea1ea0..566dde6 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -44,6 +44,9 @@ MinGW Improvements
MachO Improvements
------------------
+* ``--separate-cstring-literal-sections`` emits cstring literals into separate sections defined by their section name.
+ (`#158720 <https://github.com/llvm/llvm-project/pull/158720>`_)
+
WebAssembly Improvements
------------------------
diff --git a/lld/test/ELF/eh-frame-relocation.s b/lld/test/ELF/eh-frame-relocation.s
new file mode 100644
index 0000000..9c1fe40
--- /dev/null
+++ b/lld/test/ELF/eh-frame-relocation.s
@@ -0,0 +1,29 @@
+# REQUIRES: x86
+## Test that marker relocations are ignored and undefined symbols lead to errors.
+
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 abi.s -o abi.o
+# RUN: ld.lld a.o abi.o -o a
+# RUN: llvm-readelf -s a | FileCheck %s
+
+# CHECK: 00000000002{{.*}} 0 FUNC GLOBAL DEFAULT [[#]] __gxx_personality_v0
+
+# RUN: not ld.lld a.o 2>&1 | FileCheck %s --check-prefix=ERR
+
+# ERR: error: undefined symbol: __gxx_personality_v0
+# ERR-NEXT: >>> referenced by a.o:(.eh_frame+0x12)
+
+#--- a.s
+.cfi_startproc
+.cfi_personality 0, __gxx_personality_v0
+ ret
+.cfi_endproc
+
+.section .eh_frame,"a",@unwind
+.reloc ., BFD_RELOC_NONE, ignore
+
+#--- abi.s
+.globl __gxx_personality_v0
+.type __gxx_personality_v0, @function
+__gxx_personality_v0:
diff --git a/lld/test/wasm/archive-export.test b/lld/test/wasm/archive-export.test
index 9a76d60..c67e500 100644
--- a/lld/test/wasm/archive-export.test
+++ b/lld/test/wasm/archive-export.test
@@ -14,6 +14,9 @@ CHECK: Exports:
CHECK-NEXT: - Name: memory
CHECK-NEXT: Kind: MEMORY
CHECK-NEXT: Index: 0
+CHECK-NEXT: - Name: __stack_pointer
+CHECK-NEXT: Kind: GLOBAL
+CHECK-NEXT: Index: 0
CHECK-NEXT: - Name: foo
CHECK-NEXT: Kind: FUNCTION
CHECK-NEXT: Index: 1
diff --git a/lld/test/wasm/comdats.ll b/lld/test/wasm/comdats.ll
index 8fc301e..2dd687f 100644
--- a/lld/test/wasm/comdats.ll
+++ b/lld/test/wasm/comdats.ll
@@ -35,6 +35,9 @@ entry:
; CHECK-NEXT: - Name: memory
; CHECK-NEXT: Kind: MEMORY
; CHECK-NEXT: Index: 0
+; CHECK-NEXT: - Name: __stack_pointer
+; CHECK-NEXT: Kind: GLOBAL
+; CHECK-NEXT: Index: 0
; CHECK-NEXT: - Name: _start
; CHECK-NEXT: Kind: FUNCTION
; CHECK-NEXT: Index: 1
diff --git a/lld/test/wasm/visibility-hidden.ll b/lld/test/wasm/visibility-hidden.ll
index 36c29a8e..6ed7ba3 100644
--- a/lld/test/wasm/visibility-hidden.ll
+++ b/lld/test/wasm/visibility-hidden.ll
@@ -43,6 +43,9 @@ entry:
; CHECK-NEXT: - Name: memory
; CHECK-NEXT: Kind: MEMORY
; CHECK-NEXT: Index: 0
+; CHECK-NEXT: - Name: __stack_pointer
+; CHECK-NEXT: Kind: GLOBAL
+; CHECK-NEXT: Index: 0
; CHECK-NEXT: - Name: objectDefault
; CHECK-NEXT: Kind: FUNCTION
; CHECK-NEXT: Index: 1
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index 9b85b6c..46c848d 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -914,9 +914,10 @@ static InputGlobal *createGlobal(StringRef name, bool isMutable) {
return make<InputGlobal>(wasmGlobal, nullptr);
}
-static GlobalSymbol *createGlobalVariable(StringRef name, bool isMutable) {
+static GlobalSymbol *createGlobalVariable(StringRef name, bool isMutable,
+ uint32_t flags = 0) {
InputGlobal *g = createGlobal(name, isMutable);
- return symtab->addSyntheticGlobal(name, WASM_SYMBOL_VISIBILITY_HIDDEN, g);
+ return symtab->addSyntheticGlobal(name, flags, g);
}
static GlobalSymbol *createOptionalGlobal(StringRef name, bool isMutable) {
@@ -966,9 +967,13 @@ static void createSyntheticSymbols() {
}
if (ctx.arg.sharedMemory) {
- ctx.sym.tlsBase = createGlobalVariable("__tls_base", true);
- ctx.sym.tlsSize = createGlobalVariable("__tls_size", false);
- ctx.sym.tlsAlign = createGlobalVariable("__tls_align", false);
+ // TLS symbols are all hidden/dso-local
+ ctx.sym.tlsBase =
+ createGlobalVariable("__tls_base", true, WASM_SYMBOL_VISIBILITY_HIDDEN);
+ ctx.sym.tlsSize = createGlobalVariable("__tls_size", false,
+ WASM_SYMBOL_VISIBILITY_HIDDEN);
+ ctx.sym.tlsAlign = createGlobalVariable("__tls_align", false,
+ WASM_SYMBOL_VISIBILITY_HIDDEN);
ctx.sym.initTLS = symtab->addSyntheticFunction(
"__wasm_init_tls", WASM_SYMBOL_VISIBILITY_HIDDEN,
make<SyntheticFunction>(is64 ? i64ArgSignature : i32ArgSignature,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index f1e73d7..82e9d86 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -3126,43 +3126,6 @@ void DWARFASTParserClang::ParseSingleMember(
if (!member_clang_type.IsCompleteType())
member_clang_type.GetCompleteType();
- {
- // Older versions of clang emit the same DWARF for array[0] and array[1]. If
- // the current field is at the end of the structure, then there is
- // definitely no room for extra elements and we override the type to
- // array[0]. This was fixed by f454dfb6b5af.
- CompilerType member_array_element_type;
- uint64_t member_array_size;
- bool member_array_is_incomplete;
-
- if (member_clang_type.IsArrayType(&member_array_element_type,
- &member_array_size,
- &member_array_is_incomplete) &&
- !member_array_is_incomplete) {
- uint64_t parent_byte_size =
- parent_die.GetAttributeValueAsUnsigned(DW_AT_byte_size, UINT64_MAX);
-
- // If the attrs.member_byte_offset is still set to UINT32_MAX this means
- // that the DW_TAG_member didn't have a DW_AT_data_member_location, so
- // don't emit an error if this is the case.
- if (attrs.member_byte_offset != UINT32_MAX &&
- attrs.member_byte_offset >= parent_byte_size) {
- if (member_array_size != 1 &&
- (member_array_size != 0 ||
- attrs.member_byte_offset > parent_byte_size)) {
- module_sp->ReportError(
- "{0:x8}: DW_TAG_member '{1}' refers to type {2:x16}"
- " which extends beyond the bounds of {3:x8}",
- die.GetID(), attrs.name,
- attrs.encoding_form.Reference().GetOffset(), parent_die.GetID());
- }
-
- member_clang_type =
- m_ast.CreateArrayType(member_array_element_type, 0, false);
- }
- }
- }
-
TypeSystemClang::RequireCompleteType(member_clang_type);
clang::FieldDecl *field_decl = TypeSystemClang::AddFieldToRecordType(
diff --git a/lldb/test/Shell/SymbolFile/DWARF/incomplete-member-beyond-parent-bounds.yaml b/lldb/test/Shell/SymbolFile/DWARF/incomplete-member-beyond-parent-bounds.yaml
new file mode 100644
index 0000000..4e659d0
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/incomplete-member-beyond-parent-bounds.yaml
@@ -0,0 +1,104 @@
+# This is DWARF where we placed an incomplete type
+# at an offset that is the parent DW_AT_byte_size. Check
+# that we don't report an error in such cases.
+#
+# DW_TAG_compile_unit
+# DW_AT_name ("main.cpp")
+# DW_AT_language (DW_LANG_C)
+#
+# DW_TAG_structure_type
+# DW_AT_name ("Incomplete")
+# DW_AT_external (true)
+#
+# DW_TAG_structure_type
+# DW_AT_name ("Foo")
+# DW_AT_byte_size (0x04)
+#
+# DW_TAG_member
+# DW_AT_name ("mem")
+# DW_AT_data_member_location ("0x04")
+# DW_AT_type (0x00000011 "Incomplete")
+#
+# NULL
+#
+# NULL
+
+# RUN: yaml2obj %s > %t
+# RUN: lldb-test symbols --name=Foo --find=type %t 2>&1 | FileCheck %s
+
+# CHECK: Found 1 types:
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+DWARF:
+ debug_str:
+ - main.cpp
+ - Incomplete
+ - Foo
+ - mem
+ debug_abbrev:
+ - ID: 0
+ Table:
+ - Code: 0x1
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_language
+ Form: DW_FORM_udata
+ - Code: 0x2
+ Tag: DW_TAG_structure_type
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_external
+ Form: DW_FORM_flag_present
+ - Code: 0x3
+ Tag: DW_TAG_structure_type
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_byte_size
+ Form: DW_FORM_data1
+ - Code: 0x4
+ Tag: DW_TAG_member
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_type
+ Form: DW_FORM_ref4
+ - Attribute: DW_AT_data_member_location
+ Form: DW_FORM_data1
+ debug_info:
+ - Version: 4
+ AbbrevTableID: 0
+ AbbrOffset: 0x0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x1
+ Values:
+ - Value: 0x0
+ - Value: 0x2
+ - AbbrCode: 0x2
+ Values:
+ - Value: 0x9
+ - AbbrCode: 0x3
+ Values:
+ - Value: 0x14
+ - Value: 0x04
+ - AbbrCode: 0x4
+ Values:
+ - Value: 0x18
+ - Value: 0x11
+ - Value: 0x04
+ - AbbrCode: 0x0
+ - AbbrCode: 0x0
+...
diff --git a/lldb/test/Shell/SymbolFile/DWARF/member-beyond-parent-bounds.yaml b/lldb/test/Shell/SymbolFile/DWARF/member-beyond-parent-bounds.yaml
new file mode 100644
index 0000000..2ac538e
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/member-beyond-parent-bounds.yaml
@@ -0,0 +1,109 @@
+# This is malformed DWARF where we placed a non-zero-sized type
+# at an offset that is larger than the parent DW_AT_byte_size. Check
+# that we report an error in such cases.
+#
+# DW_TAG_compile_unit
+# DW_AT_name ("main.cpp")
+# DW_AT_language (DW_LANG_C)
+#
+# DW_TAG_base_type
+# DW_AT_name ("int")
+# DW_AT_encoding (DW_ATE_signed)
+# DW_AT_byte_size (0x04)
+#
+# DW_TAG_structure_type
+# DW_AT_name ("Foo")
+# DW_AT_byte_size (0x04)
+#
+# DW_TAG_member
+# DW_AT_name ("mem")
+# DW_AT_data_member_location ("0x05")
+# DW_AT_type (0x00000011 "int")
+#
+# NULL
+#
+# NULL
+
+# RUN: yaml2obj %s > %t
+# RUN: lldb-test symbols --name=Foo --find=type %t 2>&1 | FileCheck %s
+
+# CHECK: Found 1 types:
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+DWARF:
+ debug_str:
+ - main.cpp
+ - int
+ - Foo
+ - mem
+ debug_abbrev:
+ - ID: 0
+ Table:
+ - Code: 0x1
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_language
+ Form: DW_FORM_udata
+ - Code: 0x2
+ Tag: DW_TAG_base_type
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_encoding
+ Form: DW_FORM_data1
+ - Attribute: DW_AT_byte_size
+ Form: DW_FORM_data1
+ - Code: 0x3
+ Tag: DW_TAG_structure_type
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_byte_size
+ Form: DW_FORM_data1
+ - Code: 0x4
+ Tag: DW_TAG_member
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_type
+ Form: DW_FORM_ref4
+ - Attribute: DW_AT_data_member_location
+ Form: DW_FORM_data1
+ debug_info:
+ - Version: 4
+ AbbrevTableID: 0
+ AbbrOffset: 0x0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x1
+ Values:
+ - Value: 0x0
+ - Value: 0x2
+ - AbbrCode: 0x2
+ Values:
+ - Value: 0x9
+ - Value: 0x5
+ - Value: 0x4
+ - AbbrCode: 0x3
+ Values:
+ - Value: 0x0d
+ - Value: 0x04
+ - AbbrCode: 0x4
+ Values:
+ - Value: 0x11
+ - Value: 0x11
+ - Value: 0x05
+ - AbbrCode: 0x0
+ - AbbrCode: 0x0
+...
diff --git a/lldb/test/Shell/SymbolFile/DWARF/member-on-parent-bounds.yaml b/lldb/test/Shell/SymbolFile/DWARF/member-on-parent-bounds.yaml
new file mode 100644
index 0000000..736697c
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/member-on-parent-bounds.yaml
@@ -0,0 +1,109 @@
+# This is malformed DWARF where we placed a non-zero-sized type
+# at an offset equal to the parent DW_AT_byte_size. Check
+# that we report an error in such cases.
+#
+# DW_TAG_compile_unit
+# DW_AT_name ("main.cpp")
+# DW_AT_language (DW_LANG_C)
+#
+# DW_TAG_base_type
+# DW_AT_name ("int")
+# DW_AT_encoding (DW_ATE_signed)
+# DW_AT_byte_size (0x04)
+#
+# DW_TAG_structure_type
+# DW_AT_name ("Foo")
+# DW_AT_byte_size (0x04)
+#
+# DW_TAG_member
+# DW_AT_name ("mem")
+# DW_AT_data_member_location ("0x04")
+# DW_AT_type (0x00000011 "int")
+#
+# NULL
+#
+# NULL
+
+# RUN: yaml2obj %s > %t
+# RUN: lldb-test symbols --name=Foo --find=type %t 2>&1 | FileCheck %s
+
+# CHECK: Found 1 types:
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+DWARF:
+ debug_str:
+ - main.cpp
+ - int
+ - Foo
+ - mem
+ debug_abbrev:
+ - ID: 0
+ Table:
+ - Code: 0x1
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_language
+ Form: DW_FORM_udata
+ - Code: 0x2
+ Tag: DW_TAG_base_type
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_encoding
+ Form: DW_FORM_data1
+ - Attribute: DW_AT_byte_size
+ Form: DW_FORM_data1
+ - Code: 0x3
+ Tag: DW_TAG_structure_type
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_byte_size
+ Form: DW_FORM_data1
+ - Code: 0x4
+ Tag: DW_TAG_member
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_type
+ Form: DW_FORM_ref4
+ - Attribute: DW_AT_data_member_location
+ Form: DW_FORM_data1
+ debug_info:
+ - Version: 4
+ AbbrevTableID: 0
+ AbbrOffset: 0x0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x1
+ Values:
+ - Value: 0x0
+ - Value: 0x2
+ - AbbrCode: 0x2
+ Values:
+ - Value: 0x9
+ - Value: 0x5
+ - Value: 0x4
+ - AbbrCode: 0x3
+ Values:
+ - Value: 0x0d
+ - Value: 0x04
+ - AbbrCode: 0x4
+ Values:
+ - Value: 0x11
+ - Value: 0x11
+ - Value: 0x04
+ - AbbrCode: 0x0
+ - AbbrCode: 0x0
+...
diff --git a/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml b/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml
index fbdc626..1d1e129 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml
+++ b/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml
@@ -48,14 +48,10 @@
# RUN: yaml2obj %s > %t
# RUN: lldb-test symbols --name=UnionType --find=type %t > %t.stdout
# RUN: cat %t.stdout | FileCheck --check-prefix=STDOUT %s
-# RUN: lldb-test symbols --name=UnionType --find=type %t 2> %t.stderr
-# RUN: cat %t.stderr | FileCheck --allow-empty --check-prefix=STDERR %s
# STDOUT: Found 1 types:
# STDOUT: {{(0x)?[0-9a-fA-F]+}}: Type{0x0000002b} , name = "UnionType", size = 32, compiler_type = 0x{{[0-9a-fA-F]+}} union UnionType {
-# STDERR-NOT: error: union-types-no-member-location.yaml.tmp 0x00000031: DW_TAG_member 'array' refers to type 0x000000000000001f which extends beyond the bounds of 0x0000002b
-
--- !ELF
FileHeader:
Class: ELFCLASS64
diff --git a/lldb/test/Shell/SymbolFile/DWARF/zero-sized-member-in-parent-bounds.yaml b/lldb/test/Shell/SymbolFile/DWARF/zero-sized-member-in-parent-bounds.yaml
new file mode 100644
index 0000000..a98f62c
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/zero-sized-member-in-parent-bounds.yaml
@@ -0,0 +1,105 @@
+# This is DWARF where we placed a zero-sized type
+# at an offset equal to the parent DW_AT_byte_size. Check
+# that we don't report an error in such cases.
+#
+# DW_TAG_compile_unit
+# DW_AT_name ("main.cpp")
+# DW_AT_language (DW_LANG_C)
+#
+# DW_TAG_structure_type
+# DW_AT_name ("Bar")
+# DW_AT_byte_size (0x00)
+#
+# DW_TAG_structure_type
+# DW_AT_name ("Foo")
+# DW_AT_byte_size (0x04)
+#
+# DW_TAG_member
+# DW_AT_name ("mem")
+# DW_AT_data_member_location ("0x04")
+# DW_AT_type (0x00000011 "Bar")
+#
+# NULL
+#
+# NULL
+
+# RUN: yaml2obj %s > %t
+# RUN: lldb-test symbols --name=Foo --find=type %t 2>&1 | FileCheck %s
+
+# CHECK: Found 1 types:
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+DWARF:
+ debug_str:
+ - main.cpp
+ - Bar
+ - Foo
+ - mem
+ debug_abbrev:
+ - ID: 0
+ Table:
+ - Code: 0x1
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_language
+ Form: DW_FORM_udata
+ - Code: 0x2
+ Tag: DW_TAG_structure_type
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_byte_size
+ Form: DW_FORM_data1
+ - Code: 0x3
+ Tag: DW_TAG_structure_type
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_byte_size
+ Form: DW_FORM_data1
+ - Code: 0x4
+ Tag: DW_TAG_member
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_type
+ Form: DW_FORM_ref4
+ - Attribute: DW_AT_data_member_location
+ Form: DW_FORM_data1
+ debug_info:
+ - Version: 4
+ AbbrevTableID: 0
+ AbbrOffset: 0x0
+ AddrSize: 8
+ Entries:
+ - AbbrCode: 0x1
+ Values:
+ - Value: 0x0
+ - Value: 0x2
+ - AbbrCode: 0x2
+ Values:
+ - Value: 0x9
+ - Value: 0x0
+ - AbbrCode: 0x3
+ Values:
+ - Value: 0x0d
+ - Value: 0x04
+ - AbbrCode: 0x4
+ Values:
+ - Value: 0x11
+ - Value: 0x11
+ - Value: 0x04
+ - AbbrCode: 0x0
+ - AbbrCode: 0x0
+...
diff --git a/llvm/docs/CommandGuide/llvm-readelf.rst b/llvm/docs/CommandGuide/llvm-readelf.rst
index 284c3aa4..5403fea 100644
--- a/llvm/docs/CommandGuide/llvm-readelf.rst
+++ b/llvm/docs/CommandGuide/llvm-readelf.rst
@@ -143,6 +143,10 @@ OPTIONS
Display all notes.
+.. option:: --offloading
+
+ Display a list of HIP offload bundles.
+
.. option:: --pretty-print
When used with :option:`--elf-output-style`, JSON output will be formatted in
diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst
index 8bd29ea..0d05b94 100644
--- a/llvm/docs/CommandGuide/llvm-readobj.rst
+++ b/llvm/docs/CommandGuide/llvm-readobj.rst
@@ -104,6 +104,10 @@ file formats.
Do not demangle symbol names in the output. This option is only for ELF and
XCOFF file formats. The option is enabled by default.
+.. option:: --offloading
+
+ Display a list of HIP offload bundles.
+
.. option:: --relocations, --relocs, -r
Display the relocation entries in the file.
diff --git a/llvm/docs/CommandGuide/llvm-size.rst b/llvm/docs/CommandGuide/llvm-size.rst
index f244769..12e7c58 100644
--- a/llvm/docs/CommandGuide/llvm-size.rst
+++ b/llvm/docs/CommandGuide/llvm-size.rst
@@ -41,6 +41,13 @@ OPTIONS
as a separate section entry for ``sysv`` output. If not specified, these
symbols are ignored.
+.. option:: --exclude-pagezero
+
+ Do not include the ``__PAGEZERO`` segment when calculating size information
+ for Mach-O files. The ``__PAGEZERO`` segment is a virtual memory region used
+ for memory protection that does not contribute to the file's actual size, and
+ excluding it can provide a better representation of that size.
+
.. option:: -d
Equivalent to :option:`--radix` with a value of ``10``.
diff --git a/llvm/docs/FuzzingLLVM.rst b/llvm/docs/FuzzingLLVM.rst
index a0355d7..76eb428 100644
--- a/llvm/docs/FuzzingLLVM.rst
+++ b/llvm/docs/FuzzingLLVM.rst
@@ -33,7 +33,7 @@ clang-proto-fuzzer
A |protobuf fuzzer| that compiles valid C++ programs generated from a protobuf
class that describes a subset of the C++ language.
-This fuzzer accepts clang command line options after `ignore_remaining_args=1`.
+This fuzzer accepts clang command-line options after `ignore_remaining_args=1`.
For example, the following command will fuzz clang with a higher optimization
level:
@@ -106,7 +106,7 @@ llvm-opt-fuzzer
A |LLVM IR fuzzer| aimed at finding bugs in optimization passes.
-It receives optimization pipeline and runs it for each fuzzer input.
+It receives an optimization pipeline and runs it for each fuzzer input.
Interface of this fuzzer almost directly mirrors ``llvm-isel-fuzzer``. Both
``mtriple`` and ``passes`` arguments are required. Passes are specified in a
@@ -117,7 +117,7 @@ this format in the doxygen for ``PassBuilder::parsePassPipeline``.
% bin/llvm-opt-fuzzer <corpus-dir> -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine
-Similarly to the ``llvm-isel-fuzzer`` arguments in some predefined configurations
+Similarly to ``llvm-isel-fuzzer``, arguments in some predefined configurations
might be embedded directly into the binary file name:
.. code-block:: shell
@@ -176,7 +176,7 @@ mutations that a fuzzer in LLVM might want.
Generic Random Fuzzing
----------------------
-The most basic form of input mutation is to use the built in mutators of
+The most basic form of input mutation is to use the built-in mutators of
LibFuzzer. These simply treat the input corpus as a bag of bits and make random
mutations. This type of fuzzer is good for stressing the surface layers of a
program, and is good at testing things like lexers, parsers, or binary
@@ -244,7 +244,7 @@ by adding the following two flags to your CMake invocation:
to avoid building the sanitizers themselves with sanitizers enabled.
.. note:: You may run into issues if you build with BFD ld, which is the
- default linker on many unix systems. These issues are being tracked
+ default linker on many Unix systems. These issues are being tracked
in https://llvm.org/PR34636.
Continuously Running and Finding Bugs
@@ -280,6 +280,6 @@ your fuzzer can be built and tested when not built against libFuzzer.
There is also some handling of the CMake config for fuzzers, where you should
use the ``add_llvm_fuzzer`` to set up fuzzer targets. This function works
-similarly to functions such as ``add_llvm_tool``, but they take care of linking
+similarly to functions such as ``add_llvm_tool``, but it takes care of linking
to LibFuzzer when appropriate and can be passed the ``DUMMY_MAIN`` argument to
enable standalone testing.
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 72716fa..f7e1374 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -42,7 +42,7 @@ LLVM welcomes contributions of all kinds. To get started, please review the foll
in the LLVM system.
:doc:`BugLifeCycle`
- Describes how bugs are reported, triaged and closed.
+ Describes how bugs are reported, triaged, and closed.
:doc:`CodingStandards`
Details the LLVM coding standards and provides useful information on writing
@@ -108,7 +108,7 @@ The :doc:`CodeOfConduct` applies to all these forums and mailing lists.
`Commits Archive (llvm-commits)`__
This list contains all commit messages that are made when LLVM developers
commit code changes to the repository. It also serves as a forum for
- patch review (i.e. send patches here). It is useful for those who want to
+ patch review (i.e., send patches here). It is useful for those who want to
stay on the bleeding edge of LLVM development. This list is very high
volume.
@@ -121,7 +121,7 @@ The :doc:`CodeOfConduct` applies to all these forums and mailing lists.
.. __: http://lists.llvm.org/pipermail/llvm-bugs/
`LLVM Announcements`__
- If you just want project wide announcements such as releases, developers meetings, or blog posts, then you should check out the Announcement category on LLVM Discourse.
+ If you just want project-wide announcements such as releases, developer meetings, or blog posts, then you should check out the Announcement category on LLVM Discourse.
.. __: https://discourse.llvm.org/c/announce/46
@@ -473,7 +473,7 @@ join one in your city. Or start a new one if there is none:
Community wide proposals
------------------------
-Proposals for massive changes in how the community behaves and how the work flow
+Proposals for large-scale changes in how the community behaves and how the workflow
can be better.
.. toctree::
@@ -518,7 +518,7 @@ also be seen inline below:
Note that the web view of the LLVM community calendar shows events in
Coordinated Universal Time (UTC). If you use Google Calendar, consider
subscribing to it with the + button in the bottom-right corner to view all
-events in your local timezone alongside your other calendars.
+events in your local time zone alongside your other calendars.
.. _llvm-community-calendar-host-guidance:
@@ -554,9 +554,9 @@ An example invite looks as follows
This event is a meetup for all developers of LLDB. Meeting agendas are posted
on discourse before the event.
- Attendees are required to adhere to the LLVM Code of Conduct
+ Attendees must adhere to the LLVM Code of Conduct
(https://llvm.org/docs/CodeOfConduct.html). For any Code of Conduct reports,
- please contact the organizers, and also email conduct@llvm.org.
+ please contact the organizers and also email conduct@llvm.org.
Agenda/Meeting Minutes: Link to minutes
diff --git a/llvm/docs/GlobalISel/InstructionSelect.rst b/llvm/docs/GlobalISel/InstructionSelect.rst
index 9798ae7..5513824 100644
--- a/llvm/docs/GlobalISel/InstructionSelect.rst
+++ b/llvm/docs/GlobalISel/InstructionSelect.rst
@@ -5,8 +5,22 @@ InstructionSelect
-----------------
This pass transforms generic machine instructions into equivalent
-target-specific instructions. It traverses the ``MachineFunction`` bottom-up,
-selecting uses before definitions, enabling trivial dead code elimination.
+target-specific instructions.
+
+The legacy instruction selector, SelectionDAG, iterated over each function's
+basic blocks and constructed a dataflow graph for each. Every backend defines
+tree patterns in ``XXXInstrInfo.td``. The legacy selector started
+at the bottom and replaced SDNodes greedily.
+
+GlobalISel's instruction selector traverses the ``MachineFunction``
+bottom-up, selecting uses before definitions, enabling trivial dead code
+elimination. It does that by iterating over the basic blocks in post-order.
+Each gMIR instruction is then replaced by a MIR instruction when a matching
+pattern is found. So, when there is a 1:1 mapping between gMIR and MIR, what
+is the benefit of the global scope? Even in the case of a 1:1 mapping,
+GlobalISel includes a combiner that can match and fuse multiple gMIR
+instructions. The scope of the combination is not limited to a basic block,
+but can extend across the entire function.
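
A minimal sketch of this traversal, assuming ``MF`` is the current
``MachineFunction`` and ``Selector`` is the target's instruction selector
(both names illustrative, not the pass's actual code):

.. code-block:: c++

  // Visit blocks in post-order, and instructions bottom-up within each
  // block, so uses are selected before the instructions that define them.
  for (MachineBasicBlock *MBB : post_order(&MF)) {
    for (MachineInstr &MI : make_early_inc_range(reverse(*MBB))) {
      if (!Selector.select(MI)) // replace one gMIR instruction with MIR
        report_fatal_error("instruction selection failed");
    }
  }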
.. _api-instructionselector:
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index bcf3e96..4bda50f 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -440,10 +440,6 @@ protected:
}
}
- static unsigned getHashValue(const KeyT &Val) {
- return KeyInfoT::getHashValue(Val);
- }
-
template <typename LookupKeyT>
static unsigned getHashValue(const LookupKeyT &Val) {
return KeyInfoT::getHashValue(Val);
diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h
index 1a2331c..90d8948 100644
--- a/llvm/include/llvm/ADT/EquivalenceClasses.h
+++ b/llvm/include/llvm/ADT/EquivalenceClasses.h
@@ -180,8 +180,8 @@ public:
}
/// Returns true if \p V is contained in an equivalence class.
- bool contains(const ElemTy &V) const {
- return TheMapping.find(V) != TheMapping.end();
+ [[nodiscard]] bool contains(const ElemTy &V) const {
+ return TheMapping.contains(V);
}
/// getLeaderValue - Return the leader for the specified value that is in the
@@ -256,9 +256,11 @@ public:
}
if (!Next) {
// If the current element is the last element(not leader), set the
- // successor of the current element's predecessor to null, and set
- // the 'Leader' field of the class leader to the predecessor element.
- Pre->Next = nullptr;
+ // successor of the current element's predecessor to null while
+ // preserving the leader bit, and set the 'Leader' field of the class
+ // leader to the predecessor element.
+ Pre->Next = reinterpret_cast<const ECValue *>(
+ static_cast<intptr_t>(Pre->isLeader()));
Leader->Leader = Pre;
} else {
// If the current element is in the middle of class, then simply
diff --git a/llvm/include/llvm/ADT/ImmutableMap.h b/llvm/include/llvm/ADT/ImmutableMap.h
index 3d19ca4..32634a9 100644
--- a/llvm/include/llvm/ADT/ImmutableMap.h
+++ b/llvm/include/llvm/ADT/ImmutableMap.h
@@ -111,25 +111,25 @@ public:
}
};
- bool contains(key_type_ref K) const {
+ [[nodiscard]] bool contains(key_type_ref K) const {
return Root ? Root->contains(K) : false;
}
- bool operator==(const ImmutableMap &RHS) const {
+ [[nodiscard]] bool operator==(const ImmutableMap &RHS) const {
return Root && RHS.Root ? Root->isEqual(*RHS.Root.get()) : Root == RHS.Root;
}
- bool operator!=(const ImmutableMap &RHS) const {
+ [[nodiscard]] bool operator!=(const ImmutableMap &RHS) const {
return Root && RHS.Root ? Root->isNotEqual(*RHS.Root.get())
: Root != RHS.Root;
}
- TreeTy *getRoot() const {
+ [[nodiscard]] TreeTy *getRoot() const {
if (Root) { Root->retain(); }
return Root.get();
}
- TreeTy *getRootWithoutRetain() const { return Root.get(); }
+ [[nodiscard]] TreeTy *getRootWithoutRetain() const { return Root.get(); }
void manualRetain() {
if (Root) Root->retain();
@@ -139,7 +139,7 @@ public:
if (Root) Root->release();
}
- bool isEmpty() const { return !Root; }
+ [[nodiscard]] bool isEmpty() const { return !Root; }
public:
//===--------------------------------------------------===//
@@ -163,10 +163,10 @@ public:
data_type_ref getData() const { return (*this)->second; }
};
- iterator begin() const { return iterator(Root.get()); }
- iterator end() const { return iterator(); }
+ [[nodiscard]] iterator begin() const { return iterator(Root.get()); }
+ [[nodiscard]] iterator end() const { return iterator(); }
- data_type* lookup(key_type_ref K) const {
+ [[nodiscard]] data_type *lookup(key_type_ref K) const {
if (Root) {
TreeTy* T = Root->find(K);
if (T) return &T->getValue().second;
@@ -178,7 +178,7 @@ public:
/// getMaxElement - Returns the <key,value> pair in the ImmutableMap for
/// which key is the highest in the ordering of keys in the map. This
/// method returns NULL if the map is empty.
- value_type* getMaxElement() const {
+ [[nodiscard]] value_type *getMaxElement() const {
return Root ? &(Root->getMaxElement()->getValue()) : nullptr;
}
@@ -186,7 +186,9 @@ public:
// Utility methods.
//===--------------------------------------------------===//
- unsigned getHeight() const { return Root ? Root->getHeight() : 0; }
+ [[nodiscard]] unsigned getHeight() const {
+ return Root ? Root->getHeight() : 0;
+ }
static inline void Profile(FoldingSetNodeID& ID, const ImmutableMap& M) {
ID.AddPointer(M.Root.get());
@@ -250,7 +252,7 @@ public:
return ImmutableMapRef(NewT, Factory);
}
- bool contains(key_type_ref K) const {
+ [[nodiscard]] bool contains(key_type_ref K) const {
return Root ? Root->contains(K) : false;
}
@@ -258,16 +260,16 @@ public:
return ImmutableMap<KeyT, ValT>(Factory->getCanonicalTree(Root.get()));
}
- bool operator==(const ImmutableMapRef &RHS) const {
+ [[nodiscard]] bool operator==(const ImmutableMapRef &RHS) const {
return Root && RHS.Root ? Root->isEqual(*RHS.Root.get()) : Root == RHS.Root;
}
- bool operator!=(const ImmutableMapRef &RHS) const {
+ [[nodiscard]] bool operator!=(const ImmutableMapRef &RHS) const {
return Root && RHS.Root ? Root->isNotEqual(*RHS.Root.get())
: Root != RHS.Root;
}
- bool isEmpty() const { return !Root; }
+ [[nodiscard]] bool isEmpty() const { return !Root; }
//===--------------------------------------------------===//
// For testing.
@@ -293,10 +295,10 @@ public:
data_type_ref getData() const { return (*this)->second; }
};
- iterator begin() const { return iterator(Root.get()); }
- iterator end() const { return iterator(); }
+ [[nodiscard]] iterator begin() const { return iterator(Root.get()); }
+ [[nodiscard]] iterator end() const { return iterator(); }
- data_type *lookup(key_type_ref K) const {
+ [[nodiscard]] data_type *lookup(key_type_ref K) const {
if (Root) {
TreeTy* T = Root->find(K);
if (T) return &T->getValue().second;
@@ -308,7 +310,7 @@ public:
/// getMaxElement - Returns the <key,value> pair in the ImmutableMap for
/// which key is the highest in the ordering of keys in the map. This
/// method returns NULL if the map is empty.
- value_type* getMaxElement() const {
+ [[nodiscard]] value_type *getMaxElement() const {
return Root ? &(Root->getMaxElement()->getValue()) : nullptr;
}
@@ -316,7 +318,9 @@ public:
// Utility methods.
//===--------------------------------------------------===//
- unsigned getHeight() const { return Root ? Root->getHeight() : 0; }
+ [[nodiscard]] unsigned getHeight() const {
+ return Root ? Root->getHeight() : 0;
+ }
static inline void Profile(FoldingSetNodeID &ID, const ImmutableMapRef &M) {
ID.AddPointer(M.Root.get());
diff --git a/llvm/include/llvm/ADT/PackedVector.h b/llvm/include/llvm/ADT/PackedVector.h
index 1146cc4..09c20e3 100644
--- a/llvm/include/llvm/ADT/PackedVector.h
+++ b/llvm/include/llvm/ADT/PackedVector.h
@@ -20,51 +20,6 @@
namespace llvm {
-template <typename T, unsigned BitNum, typename BitVectorTy, bool isSigned>
-class PackedVectorBase;
-
-// This won't be necessary if we can specialize members without specializing
-// the parent template.
-template <typename T, unsigned BitNum, typename BitVectorTy>
-class PackedVectorBase<T, BitNum, BitVectorTy, false> {
-protected:
- static T getValue(const BitVectorTy &Bits, unsigned Idx) {
- T val = T();
- for (unsigned i = 0; i != BitNum; ++i)
- val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i));
- return val;
- }
-
- static void setValue(BitVectorTy &Bits, unsigned Idx, T val) {
- assert((val >> BitNum) == 0 && "value is too big");
- for (unsigned i = 0; i != BitNum; ++i)
- Bits[(Idx * BitNum) + i] = val & (T(1) << i);
- }
-};
-
-template <typename T, unsigned BitNum, typename BitVectorTy>
-class PackedVectorBase<T, BitNum, BitVectorTy, true> {
-protected:
- static T getValue(const BitVectorTy &Bits, unsigned Idx) {
- T val = T();
- for (unsigned i = 0; i != BitNum-1; ++i)
- val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i));
- if (Bits[(Idx * BitNum) + BitNum - 1])
- val = ~val;
- return val;
- }
-
- static void setValue(BitVectorTy &Bits, unsigned Idx, T val) {
- if (val < 0) {
- val = ~val;
- Bits.set((Idx * BitNum) + BitNum - 1);
- }
- assert((val >> (BitNum-1)) == 0 && "value is too big");
- for (unsigned i = 0; i != BitNum-1; ++i)
- Bits[(Idx * BitNum) + i] = val & (T(1) << i);
- }
-};
-
/// Store a vector of values using a specific number of bits for each
/// value. Both signed and unsigned types can be used, e.g.
/// @code
@@ -73,15 +28,46 @@ protected:
/// will create a vector accepting values -2, -1, 0, 1. Any other value will hit
/// an assertion.
template <typename T, unsigned BitNum, typename BitVectorTy = BitVector>
-class PackedVector : public PackedVectorBase<T, BitNum, BitVectorTy,
- std::numeric_limits<T>::is_signed> {
+class PackedVector {
BitVectorTy Bits;
// Keep track of the number of elements on our own.
// We always maintain Bits.size() == NumElements * BitNum.
// Used to avoid an integer division in size().
unsigned NumElements = 0;
- using base = PackedVectorBase<T, BitNum, BitVectorTy,
- std::numeric_limits<T>::is_signed>;
+
+ static T getValue(const BitVectorTy &Bits, unsigned Idx) {
+ if constexpr (std::numeric_limits<T>::is_signed) {
+ T val = T();
+ for (unsigned i = 0; i != BitNum - 1; ++i)
+ val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i));
+ if (Bits[(Idx * BitNum) + BitNum - 1])
+ val = ~val;
+ return val;
+ } else {
+ T val = T();
+ for (unsigned i = 0; i != BitNum; ++i)
+ val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i));
+ return val;
+ }
+ }
+
+ static void setValue(BitVectorTy &Bits, unsigned Idx, T val) {
+ if constexpr (std::numeric_limits<T>::is_signed) {
+ if (val < 0) {
+ val = ~val;
+ Bits.set((Idx * BitNum) + BitNum - 1);
+ } else {
+ Bits.reset((Idx * BitNum) + BitNum - 1);
+ }
+ assert((val >> (BitNum - 1)) == 0 && "value is too big");
+ for (unsigned i = 0; i != BitNum - 1; ++i)
+ Bits[(Idx * BitNum) + i] = val & (T(1) << i);
+ } else {
+ assert((val >> BitNum) == 0 && "value is too big");
+ for (unsigned i = 0; i != BitNum; ++i)
+ Bits[(Idx * BitNum) + i] = val & (T(1) << i);
+ }
+ }
public:
class reference {
@@ -97,9 +83,7 @@ public:
return *this;
}
- operator T() const {
- return Vec.getValue(Vec.Bits, Idx);
- }
+ operator T() const { return Vec.getValue(Vec.Bits, Idx); }
};
PackedVector() = default;
@@ -128,25 +112,17 @@ public:
}
void push_back(T val) {
- resize(size()+1);
- (*this)[size()-1] = val;
+ resize(size() + 1);
+ (*this)[size() - 1] = val;
}
- reference operator[](unsigned Idx) {
- return reference(*this, Idx);
- }
+ reference operator[](unsigned Idx) { return reference(*this, Idx); }
- T operator[](unsigned Idx) const {
- return base::getValue(Bits, Idx);
- }
+ T operator[](unsigned Idx) const { return getValue(Bits, Idx); }
- bool operator==(const PackedVector &RHS) const {
- return Bits == RHS.Bits;
- }
+ bool operator==(const PackedVector &RHS) const { return Bits == RHS.Bits; }
- bool operator!=(const PackedVector &RHS) const {
- return Bits != RHS.Bits;
- }
+ bool operator!=(const PackedVector &RHS) const { return Bits != RHS.Bits; }
PackedVector &operator|=(const PackedVector &RHS) {
Bits |= RHS.Bits;
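
A small usage sketch of the refactored class above (hypothetical function
name): with ``BitNum = 2`` and a signed element type, exactly the values
-2..1 are representable, matching the class comment.

  #include "llvm/ADT/PackedVector.h"
  #include <cassert>

  void packedVectorExample() {
    llvm::PackedVector<int, 2> V; // 2 bits per element, signed scheme
    V.push_back(1);
    V.push_back(-2); // stored as ~(-2) == 1 plus the sign bit
    assert(V[0] == 1 && V[1] == -2);
  }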
diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h
index 36b3243..77805f5 100644
--- a/llvm/include/llvm/ADT/SmallVector.h
+++ b/llvm/include/llvm/ADT/SmallVector.h
@@ -199,17 +199,18 @@ protected:
}
/// Check whether any part of the range will be invalidated by clearing.
- void assertSafeToReferenceAfterClear(const T *From, const T *To) {
- if (From == To)
- return;
- this->assertSafeToReferenceAfterResize(From, 0);
- this->assertSafeToReferenceAfterResize(To - 1, 0);
- }
- template <
- class ItTy,
- std::enable_if_t<!std::is_same<std::remove_const_t<ItTy>, T *>::value,
- bool> = false>
- void assertSafeToReferenceAfterClear(ItTy, ItTy) {}
+ template <class ItTy>
+ void assertSafeToReferenceAfterClear(ItTy From, ItTy To) {
+ if constexpr (std::is_pointer_v<ItTy> &&
+ std::is_same_v<
+ std::remove_const_t<std::remove_pointer_t<ItTy>>,
+ std::remove_const_t<T>>) {
+ if (From == To)
+ return;
+ this->assertSafeToReferenceAfterResize(From, 0);
+ this->assertSafeToReferenceAfterResize(To - 1, 0);
+ }
+ }
/// Check whether any part of the range will be invalidated by growing.
template <class ItTy> void assertSafeToAddRange(ItTy From, ItTy To) {
diff --git a/llvm/include/llvm/ADT/SparseSet.h b/llvm/include/llvm/ADT/SparseSet.h
index 395cfc3..9783301 100644
--- a/llvm/include/llvm/ADT/SparseSet.h
+++ b/llvm/include/llvm/ADT/SparseSet.h
@@ -171,23 +171,23 @@ public:
using iterator = typename DenseT::iterator;
using const_iterator = typename DenseT::const_iterator;
- const_iterator begin() const { return Dense.begin(); }
- const_iterator end() const { return Dense.end(); }
- iterator begin() { return Dense.begin(); }
- iterator end() { return Dense.end(); }
+ [[nodiscard]] const_iterator begin() const { return Dense.begin(); }
+ [[nodiscard]] const_iterator end() const { return Dense.end(); }
+ [[nodiscard]] iterator begin() { return Dense.begin(); }
+ [[nodiscard]] iterator end() { return Dense.end(); }
/// empty - Returns true if the set is empty.
///
/// This is not the same as BitVector::empty().
///
- bool empty() const { return Dense.empty(); }
+ [[nodiscard]] bool empty() const { return Dense.empty(); }
/// size - Returns the number of elements in the set.
///
/// This is not the same as BitVector::size() which returns the size of the
/// universe.
///
- size_type size() const { return Dense.size(); }
+ [[nodiscard]] size_type size() const { return Dense.size(); }
/// clear - Clears the set. This is a very fast constant time operation.
///
@@ -222,21 +222,27 @@ public:
/// @param Key A valid key to find.
/// @returns An iterator to the element identified by key, or end().
///
- iterator find(const KeyT &Key) { return findIndex(KeyIndexOf(Key)); }
+ [[nodiscard]] iterator find(const KeyT &Key) {
+ return findIndex(KeyIndexOf(Key));
+ }
- const_iterator find(const KeyT &Key) const {
+ [[nodiscard]] const_iterator find(const KeyT &Key) const {
return const_cast<SparseSet *>(this)->findIndex(KeyIndexOf(Key));
}
/// Check if the set contains the given \c Key.
///
/// @param Key A valid key to find.
- bool contains(const KeyT &Key) const { return find(Key) != end(); }
+ [[nodiscard]] bool contains(const KeyT &Key) const {
+ return find(Key) != end();
+ }
/// count - Returns 1 if this set contains an element identified by Key,
/// 0 otherwise.
///
- size_type count(const KeyT &Key) const { return contains(Key) ? 1 : 0; }
+ [[nodiscard]] size_type count(const KeyT &Key) const {
+ return contains(Key) ? 1 : 0;
+ }
/// insert - Attempts to insert a new element.
///
diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h
index 2c146fb..01cbf2d3 100644
--- a/llvm/include/llvm/ADT/StringMap.h
+++ b/llvm/include/llvm/ADT/StringMap.h
@@ -102,18 +102,18 @@ public:
return reinterpret_cast<StringMapEntryBase *>(TombstoneIntVal);
}
- unsigned getNumBuckets() const { return NumBuckets; }
- unsigned getNumItems() const { return NumItems; }
+ [[nodiscard]] unsigned getNumBuckets() const { return NumBuckets; }
+ [[nodiscard]] unsigned getNumItems() const { return NumItems; }
- bool empty() const { return NumItems == 0; }
- unsigned size() const { return NumItems; }
+ [[nodiscard]] bool empty() const { return NumItems == 0; }
+ [[nodiscard]] unsigned size() const { return NumItems; }
/// Returns the hash value that will be used for the given string.
/// This allows precomputing the value and passing it explicitly
/// to some of the functions.
/// The implementation of this function is not guaranteed to be stable
/// and may change.
- LLVM_ABI static uint32_t hash(StringRef Key);
+ [[nodiscard]] LLVM_ABI static uint32_t hash(StringRef Key);
void swap(StringMapImpl &Other) {
std::swap(TheTable, Other.TheTable);
@@ -220,30 +220,35 @@ public:
using const_iterator = StringMapIterBase<ValueTy, true>;
using iterator = StringMapIterBase<ValueTy, false>;
- iterator begin() { return iterator(TheTable, NumBuckets != 0); }
- iterator end() { return iterator(TheTable + NumBuckets); }
- const_iterator begin() const {
+ [[nodiscard]] iterator begin() { return iterator(TheTable, NumBuckets != 0); }
+ [[nodiscard]] iterator end() { return iterator(TheTable + NumBuckets); }
+ [[nodiscard]] const_iterator begin() const {
return const_iterator(TheTable, NumBuckets != 0);
}
- const_iterator end() const { return const_iterator(TheTable + NumBuckets); }
+ [[nodiscard]] const_iterator end() const {
+ return const_iterator(TheTable + NumBuckets);
+ }
- iterator_range<StringMapKeyIterator<ValueTy>> keys() const {
+ [[nodiscard]] iterator_range<StringMapKeyIterator<ValueTy>> keys() const {
return make_range(StringMapKeyIterator<ValueTy>(begin()),
StringMapKeyIterator<ValueTy>(end()));
}
- iterator find(StringRef Key) { return find(Key, hash(Key)); }
+ [[nodiscard]] iterator find(StringRef Key) { return find(Key, hash(Key)); }
- iterator find(StringRef Key, uint32_t FullHashValue) {
+ [[nodiscard]] iterator find(StringRef Key, uint32_t FullHashValue) {
int Bucket = FindKey(Key, FullHashValue);
if (Bucket == -1)
return end();
return iterator(TheTable + Bucket);
}
- const_iterator find(StringRef Key) const { return find(Key, hash(Key)); }
+ [[nodiscard]] const_iterator find(StringRef Key) const {
+ return find(Key, hash(Key));
+ }
- const_iterator find(StringRef Key, uint32_t FullHashValue) const {
+ [[nodiscard]] const_iterator find(StringRef Key,
+ uint32_t FullHashValue) const {
int Bucket = FindKey(Key, FullHashValue);
if (Bucket == -1)
return end();
@@ -252,7 +257,7 @@ public:
/// lookup - Return the entry for the specified key, or a default
/// constructed value if no such entry exists.
- ValueTy lookup(StringRef Key) const {
+ [[nodiscard]] ValueTy lookup(StringRef Key) const {
const_iterator Iter = find(Key);
if (Iter != end())
return Iter->second;
@@ -261,7 +266,7 @@ public:
/// at - Return the entry for the specified key, or abort if no such
/// entry exists.
- const ValueTy &at(StringRef Val) const {
+ [[nodiscard]] const ValueTy &at(StringRef Val) const {
auto Iter = this->find(Val);
assert(Iter != this->end() && "StringMap::at failed due to a missing key");
return Iter->second;
@@ -272,18 +277,22 @@ public:
ValueTy &operator[](StringRef Key) { return try_emplace(Key).first->second; }
/// contains - Return true if the element is in the map, false otherwise.
- bool contains(StringRef Key) const { return find(Key) != end(); }
+ [[nodiscard]] bool contains(StringRef Key) const {
+ return find(Key) != end();
+ }
/// count - Return 1 if the element is in the map, 0 otherwise.
- size_type count(StringRef Key) const { return contains(Key) ? 1 : 0; }
+ [[nodiscard]] size_type count(StringRef Key) const {
+ return contains(Key) ? 1 : 0;
+ }
template <typename InputTy>
- size_type count(const StringMapEntry<InputTy> &MapEntry) const {
+ [[nodiscard]] size_type count(const StringMapEntry<InputTy> &MapEntry) const {
return count(MapEntry.getKey());
}
/// equal - check whether both of the containers are equal.
- bool operator==(const StringMap &RHS) const {
+ [[nodiscard]] bool operator==(const StringMap &RHS) const {
if (size() != RHS.size())
return false;
@@ -302,7 +311,9 @@ public:
return true;
}
- bool operator!=(const StringMap &RHS) const { return !(*this == RHS); }
+ [[nodiscard]] bool operator!=(const StringMap &RHS) const {
+ return !(*this == RHS);
+ }
/// insert - Insert the specified key/value pair into the map. If the key
/// already exists in the map, return false and ignore the request, otherwise
@@ -447,8 +458,12 @@ public:
AdvancePastEmptyBuckets();
}
- reference operator*() const { return *static_cast<value_type *>(*Ptr); }
- pointer operator->() const { return static_cast<value_type *>(*Ptr); }
+ [[nodiscard]] reference operator*() const {
+ return *static_cast<value_type *>(*Ptr);
+ }
+ [[nodiscard]] pointer operator->() const {
+ return static_cast<value_type *>(*Ptr);
+ }
StringMapIterBase &operator++() { // Preincrement
++Ptr;
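
A brief sketch of the precomputed-hash overloads marked ``[[nodiscard]]``
above (function name and map contents are illustrative):

  #include "llvm/ADT/StringMap.h"
  #include <cassert>

  void stringMapHashExample() {
    llvm::StringMap<int> M;
    M["key"] = 1;
    // Hash once, then reuse the value across several lookups of the string.
    uint32_t H = llvm::StringMap<int>::hash("key");
    auto It = M.find("key", H);
    assert(It != M.end() && It->second == 1);
  }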
diff --git a/llvm/include/llvm/ADT/StringSet.h b/llvm/include/llvm/ADT/StringSet.h
index b485342..c8be3f2 100644
--- a/llvm/include/llvm/ADT/StringSet.h
+++ b/llvm/include/llvm/ADT/StringSet.h
@@ -57,7 +57,9 @@ public:
}
/// Check if the set contains the given \c key.
- bool contains(StringRef key) const { return Base::contains(key); }
+ [[nodiscard]] bool contains(StringRef key) const {
+ return Base::contains(key);
+ }
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Bitstream/BitstreamWriter.h b/llvm/include/llvm/Bitstream/BitstreamWriter.h
index 5f53681..a293864 100644
--- a/llvm/include/llvm/Bitstream/BitstreamWriter.h
+++ b/llvm/include/llvm/Bitstream/BitstreamWriter.h
@@ -87,7 +87,7 @@ class BitstreamWriter {
void WriteWord(unsigned Value) {
Value =
- support::endian::byte_swap<uint32_t, llvm::endianness::little>(Value);
+ support::endian::byte_swap<uint32_t>(Value, llvm::endianness::little);
Buffer.append(reinterpret_cast<const char *>(&Value),
reinterpret_cast<const char *>(&Value + 1));
}
diff --git a/llvm/include/llvm/CAS/FileOffset.h b/llvm/include/llvm/CAS/FileOffset.h
new file mode 100644
index 0000000..21d045e
--- /dev/null
+++ b/llvm/include/llvm/CAS/FileOffset.h
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the interface for FileOffset, which represents stored
+/// data at an offset from the beginning of a file.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_FILEOFFSET_H
+#define LLVM_CAS_FILEOFFSET_H
+
+#include <cstdlib>
+
+namespace llvm::cas {
+
+/// FileOffset is a wrapper around `uint64_t` to represent the offset of data
+/// from the beginning of the file.
+class FileOffset {
+public:
+ uint64_t get() const { return Offset; }
+
+ explicit operator bool() const { return Offset; }
+
+ FileOffset() = default;
+ explicit FileOffset(uint64_t Offset) : Offset(Offset) {}
+
+private:
+ uint64_t Offset = 0;
+};
+
+} // namespace llvm::cas
+
+#endif // LLVM_CAS_FILEOFFSET_H
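
A trivial usage sketch of the wrapper declared above (values and function
name are illustrative):

  #include "llvm/CAS/FileOffset.h"

  void fileOffsetExample() {
    llvm::cas::FileOffset Off(128);
    if (Off) {                  // explicit bool: true for non-zero offsets
      uint64_t Raw = Off.get(); // Raw == 128
      (void)Raw;
    }
  }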
diff --git a/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h
new file mode 100644
index 0000000..5e41bf6
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h
@@ -0,0 +1,236 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the interface for OnDiskTrieRawHashMap, a thread-safe
+/// and (mostly) lock-free hash map stored as a trie and backed by persistent
+/// files on disk.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKTRIERAWHASHMAP_H
+#define LLVM_CAS_ONDISKTRIERAWHASHMAP_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/FileOffset.h"
+#include "llvm/Support/Error.h"
+#include <optional>
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace cas {
+
+/// OnDiskTrieRawHashMap is a persistent trie data structure used as a hash
+/// map. The keys are fixed-length and are expected to be binary hashes with a
+/// normal distribution.
+///
+/// - Thread-safety is achieved through the use of atomics within a shared
+/// memory mapping. Atomic access does not work on networked filesystems.
+/// - Filesystem locks are used, but only sparingly:
+/// - during initialization, for creating / opening an existing store;
+/// - for the lifetime of the instance, a shared/reader lock is held
+/// - during destruction, if there are no concurrent readers, to shrink the
+/// files to their minimum size.
+/// - Path is used as a directory:
+/// - "index" stores the root trie and subtries.
+/// - "data" stores (most of) the entries, like a bump-ptr-allocator.
+/// - Large entries are stored externally in a file named by the key.
+/// - Code is system-dependent and the binary format itself is not portable.
+/// These are not artifacts that can/should be moved between different
+/// systems; they are only appropriate for local storage.
+class OnDiskTrieRawHashMap {
+public:
+ LLVM_DUMP_METHOD void dump() const;
+ void
+ print(raw_ostream &OS,
+ function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+
+public:
+ /// Const value proxy to access the records stored in TrieRawHashMap.
+ struct ConstValueProxy {
+ ConstValueProxy() = default;
+ ConstValueProxy(ArrayRef<uint8_t> Hash, ArrayRef<char> Data)
+ : Hash(Hash), Data(Data) {}
+ ConstValueProxy(ArrayRef<uint8_t> Hash, StringRef Data)
+ : Hash(Hash), Data(Data.begin(), Data.size()) {}
+
+ ArrayRef<uint8_t> Hash;
+ ArrayRef<char> Data;
+ };
+
+ /// Value proxy to access the records stored in TrieRawHashMap.
+ struct ValueProxy {
+ operator ConstValueProxy() const { return ConstValueProxy(Hash, Data); }
+
+ ValueProxy() = default;
+ ValueProxy(ArrayRef<uint8_t> Hash, MutableArrayRef<char> Data)
+ : Hash(Hash), Data(Data) {}
+
+ ArrayRef<uint8_t> Hash;
+ MutableArrayRef<char> Data;
+ };
+
+ /// Validate the trie data structure.
+ ///
+ /// Callback receives the file offset to the data entry and the data stored.
+ Error validate(
+ function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const;
+
+ /// Check that the file offset is within the valid range for
+ /// OnDiskTrieRawHashMap.
+ static bool validOffset(FileOffset Offset) {
+ return Offset.get() < (1LL << 48);
+ }
+
+public:
+ /// Template class to implement a `pointer` type into the trie data structure.
+ ///
+ /// It provides pointer-like operations, e.g., dereferencing to get the
+ /// underlying data. It also reserves the top 16 bits of the pointer value,
+ /// which can be used to pack additional information if needed.
+ template <class ProxyT> class PointerImpl {
+ public:
+ FileOffset getOffset() const {
+ return FileOffset(OffsetLow32 | (uint64_t)OffsetHigh16 << 32);
+ }
+
+ explicit operator bool() const { return IsValue; }
+
+ const ProxyT &operator*() const {
+ assert(IsValue);
+ return Value;
+ }
+ const ProxyT *operator->() const {
+ assert(IsValue);
+ return &Value;
+ }
+
+ PointerImpl() = default;
+
+ protected:
+ PointerImpl(ProxyT Value, FileOffset Offset, bool IsValue = true)
+ : Value(Value), OffsetLow32((uint64_t)Offset.get()),
+ OffsetHigh16((uint64_t)Offset.get() >> 32), IsValue(IsValue) {
+ if (IsValue)
+ assert(validOffset(Offset));
+ }
+
+ ProxyT Value;
+ uint32_t OffsetLow32 = 0;
+ uint16_t OffsetHigh16 = 0;
+
+ // True if points to a value (not a "nullptr"). Use an extra field because
+ // 0 can be a valid offset.
+ bool IsValue = false;
+ };
+
+ class pointer;
+ class const_pointer : public PointerImpl<ConstValueProxy> {
+ public:
+ const_pointer() = default;
+
+ private:
+ friend class pointer;
+ friend class OnDiskTrieRawHashMap;
+ using const_pointer::PointerImpl::PointerImpl;
+ };
+
+ class pointer : public PointerImpl<ValueProxy> {
+ public:
+ operator const_pointer() const {
+ return const_pointer(Value, getOffset(), IsValue);
+ }
+
+ pointer() = default;
+
+ private:
+ friend class OnDiskTrieRawHashMap;
+ using pointer::PointerImpl::PointerImpl;
+ };
+
+ /// Find the value from hash.
+ ///
+ /// \returns a pointer to the value if it exists, otherwise a non-value
+ /// pointer that evaluates to `false` when converted to boolean.
+ const_pointer find(ArrayRef<uint8_t> Hash) const;
+
+ /// Helper function to recover a pointer into the trie from a file offset.
+ Expected<const_pointer> recoverFromFileOffset(FileOffset Offset) const;
+
+ using LazyInsertOnConstructCB =
+ function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>;
+ using LazyInsertOnLeakCB =
+ function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue,
+ FileOffset FinalOffset, ValueProxy FinalValue)>;
+
+ /// Insert lazily.
+ ///
+ /// \p OnConstruct is called when ready to insert a value, after allocating
+ /// space for the data. It is called at most once.
+ ///
+ /// \p OnLeak is called only if \p OnConstruct has been called and a race
+ /// occurred before insertion, causing the tentative offset and data to be
+ /// abandoned. This allows clients to clean up other results or update any
+ /// references.
+ ///
+ /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success.
+ /// The in-memory \a TrieRawHashMap uses LazyAtomicPointer to synchronize
+ /// simultaneous writes, but that seems dangerous to use in a memory-mapped
+ /// file in case a process crashes in the busy state.
+ Expected<pointer> insertLazy(ArrayRef<uint8_t> Hash,
+ LazyInsertOnConstructCB OnConstruct = nullptr,
+ LazyInsertOnLeakCB OnLeak = nullptr);
+
+ Expected<pointer> insert(const ConstValueProxy &Value) {
+ return insertLazy(Value.Hash, [&](FileOffset, ValueProxy Allocated) {
+ assert(Allocated.Hash == Value.Hash);
+ assert(Allocated.Data.size() == Value.Data.size());
+ llvm::copy(Value.Data, Allocated.Data.begin());
+ });
+ }
+
+ size_t size() const;
+ size_t capacity() const;
+
+ /// Gets or creates a file at \p Path with a hash-mapped trie named \p
+ /// TrieName. The hash size is \p NumHashBits (in bits) and the records store
+ /// data of size \p DataSize (in bytes).
+ ///
+ /// \p MaxFileSize controls the maximum file size to support, limiting the
+ /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting
+ /// size if a new file is created.
+ ///
+ /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to
+ /// configure the trie, if it doesn't already exist.
+ ///
+ /// \pre NumHashBits is a multiple of 8 (byte-aligned).
+ static Expected<OnDiskTrieRawHashMap>
+ create(const Twine &Path, const Twine &TrieName, size_t NumHashBits,
+ uint64_t DataSize, uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ std::optional<size_t> NewTableNumRootBits = std::nullopt,
+ std::optional<size_t> NewTableNumSubtrieBits = std::nullopt);
+
+ OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS);
+ OnDiskTrieRawHashMap &operator=(OnDiskTrieRawHashMap &&RHS);
+ ~OnDiskTrieRawHashMap();
+
+private:
+ struct ImplType;
+ explicit OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl);
+ std::unique_ptr<ImplType> Impl;
+};
+
+} // namespace cas
+} // namespace llvm
+
+#endif // LLVM_CAS_ONDISKTRIERAWHASHMAP_H
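
A hedged usage sketch of the create/insert API declared above; the path,
trie name, and size parameters are illustrative assumptions, not
recommended values.

  #include "llvm/CAS/OnDiskTrieRawHashMap.h"

  llvm::Error onDiskTrieExample() {
    using namespace llvm;
    using namespace llvm::cas;
    auto MapOrErr = OnDiskTrieRawHashMap::create(
        "/tmp/example.trie", "example-trie", /*NumHashBits=*/160,
        /*DataSize=*/8, /*MaxFileSize=*/1024 * 1024,
        /*NewFileInitialSize=*/std::nullopt);
    if (!MapOrErr)
      return MapOrErr.takeError();
    uint8_t Hash[20] = {}; // 160-bit key: NumHashBits / 8 bytes
    char Data[8] = {};     // record payload of DataSize bytes
    auto P = MapOrErr->insert({Hash, ArrayRef<char>(Data)});
    if (!P)
      return P.takeError();
    return Error::success();
  }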
diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
index 2849497..fee4bb1 100644
--- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h
+++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
@@ -144,6 +144,8 @@ public:
static bool isUnsignedDIType(const DIType *Ty);
const InstructionOrdering &getInstOrdering() const { return InstOrdering; }
+
+ const LexicalScopes &getLexicalScopes() const { return LScopes; }
};
} // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/LexicalScopes.h b/llvm/include/llvm/CodeGen/LexicalScopes.h
index 4172e90..993df54 100644
--- a/llvm/include/llvm/CodeGen/LexicalScopes.h
+++ b/llvm/include/llvm/CodeGen/LexicalScopes.h
@@ -141,12 +141,18 @@ class LexicalScopes {
public:
LexicalScopes() = default;
+ /// Scan module to build subprogram-to-function map.
+ LLVM_ABI void initialize(const Module &);
+
/// Scan machine function and construct lexical scope nest, resets
/// the instance if necessary.
- LLVM_ABI void initialize(const MachineFunction &);
+ LLVM_ABI void scanFunction(const MachineFunction &);
+
+ /// Reset the instance so that it's prepared for another module.
+ LLVM_ABI void resetModule();
- /// Release memory.
- LLVM_ABI void reset();
+ /// Reset the instance so that it's prepared for another function.
+ LLVM_ABI void resetFunction();
/// Return true if there is any lexical scope information available.
bool empty() { return CurrentFnLexicalScope == nullptr; }
@@ -196,6 +202,11 @@ public:
/// Find or create an abstract lexical scope.
LLVM_ABI LexicalScope *getOrCreateAbstractScope(const DILocalScope *Scope);
+ /// Get the function to which the given subprogram is attached, if it exists.
+ const Function *getFunction(const DISubprogram *SP) const {
+ return FunctionMap.lookup(SP);
+ }
+
private:
/// Find lexical scope for the given Scope/IA. If not available
/// then create new lexical scope.
@@ -225,6 +236,9 @@ private:
const MachineFunction *MF = nullptr;
+ /// Mapping between DISubprograms and IR functions.
+ DenseMap<const DISubprogram *, const Function *> FunctionMap;
+
/// Tracks the scopes in the current function.
// Use an unordered_map to ensure value pointer validity over insertion.
std::unordered_map<const DILocalScope *, LexicalScope> LexicalScopeMap;
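
A sketch of the split initialize/scan API above at a hypothetical call site
(``M`` and ``MF`` are an assumed existing ``Module`` and ``MachineFunction``):

  #include "llvm/CodeGen/LexicalScopes.h"

  void lexicalScopesExample(const llvm::Module &M,
                            const llvm::MachineFunction &MF) {
    llvm::LexicalScopes LS;
    LS.initialize(M);    // once per module: subprogram-to-function map
    LS.scanFunction(MF); // per machine function: lexical scope nest
    // ... queries against LS ...
    LS.resetFunction();  // ready for the next function
    LS.resetModule();    // ready for the next module
  }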
diff --git a/llvm/include/llvm/CodeGen/RDFGraph.h b/llvm/include/llvm/CodeGen/RDFGraph.h
index 8a93afb..6bb6033 100644
--- a/llvm/include/llvm/CodeGen/RDFGraph.h
+++ b/llvm/include/llvm/CodeGen/RDFGraph.h
@@ -447,7 +447,7 @@ private:
AllocatorTy MemPool;
};
-using RegisterSet = std::set<RegisterRef>;
+using RegisterSet = std::set<RegisterRef, RegisterRefLess>;
struct TargetOperandInfo {
TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {}
diff --git a/llvm/include/llvm/CodeGen/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h
index 4a9a406..82027ca 100644
--- a/llvm/include/llvm/CodeGen/RDFRegisters.h
+++ b/llvm/include/llvm/CodeGen/RDFRegisters.h
@@ -199,6 +199,33 @@ private:
std::vector<AliasInfo> AliasInfos;
};
+struct RegisterRefEqualTo {
+ constexpr RegisterRefEqualTo(const llvm::rdf::PhysicalRegisterInfo &pri)
+ : PRI(&pri) {}
+
+ bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const {
+ return PRI->equal_to(A, B);
+ }
+
+private:
+ // Make it a pointer just in case. See comment in `RegisterRefLess` below.
+ const llvm::rdf::PhysicalRegisterInfo *PRI;
+};
+
+struct RegisterRefLess {
+ constexpr RegisterRefLess(const llvm::rdf::PhysicalRegisterInfo &pri)
+ : PRI(&pri) {}
+
+ bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const {
+ return PRI->less(A, B);
+ }
+
+private:
+ // Make it a pointer because apparently some versions of MSVC use std::swap
+ // on the comparator object.
+ const llvm::rdf::PhysicalRegisterInfo *PRI;
+};
+
struct RegisterAggr {
RegisterAggr(const PhysicalRegisterInfo &pri)
: Units(pri.getTRI().getNumRegUnits()), PRI(pri) {}
@@ -334,18 +361,6 @@ template <> struct hash<llvm::rdf::RegisterAggr> {
}
};
-template <> struct equal_to<llvm::rdf::RegisterRef> {
- constexpr equal_to(const llvm::rdf::PhysicalRegisterInfo &pri) : PRI(&pri) {}
-
- bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const {
- return PRI->equal_to(A, B);
- }
-
-private:
- // Make it a pointer just in case. See comment in `less` below.
- const llvm::rdf::PhysicalRegisterInfo *PRI;
-};
-
template <> struct equal_to<llvm::rdf::RegisterAggr> {
bool operator()(const llvm::rdf::RegisterAggr &A,
const llvm::rdf::RegisterAggr &B) const {
@@ -353,23 +368,10 @@ template <> struct equal_to<llvm::rdf::RegisterAggr> {
}
};
-template <> struct less<llvm::rdf::RegisterRef> {
- constexpr less(const llvm::rdf::PhysicalRegisterInfo &pri) : PRI(&pri) {}
-
- bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const {
- return PRI->less(A, B);
- }
-
-private:
- // Make it a pointer because apparently some versions of MSVC use std::swap
- // on the std::less specialization.
- const llvm::rdf::PhysicalRegisterInfo *PRI;
-};
-
} // namespace std
namespace llvm::rdf {
-using RegisterSet = std::set<RegisterRef, std::less<RegisterRef>>;
+using RegisterSet = std::set<RegisterRef, RegisterRefLess>;
} // namespace llvm::rdf
#endif // LLVM_CODEGEN_RDFREGISTERS_H
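
Since the comparator now carries state, a set has to be constructed with an
explicit comparator instance; a minimal sketch (``PRI`` is an assumed
``PhysicalRegisterInfo``):

  #include "llvm/CodeGen/RDFRegisters.h"

  void registerSetExample(const llvm::rdf::PhysicalRegisterInfo &PRI) {
    // std::set cannot default-construct a stateful comparator; pass it in.
    llvm::rdf::RegisterSet Live(llvm::rdf::RegisterRefLess(PRI));
    (void)Live;
  }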
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index 1ade9ce..db781b58 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -1268,6 +1268,15 @@ struct WriteT {
using EmptyTrait = std::true_type;
};
+// V6: [6.4.7] Looprange clause
+template <typename T, typename I, typename E> struct LoopRangeT {
+ using Begin = E;
+ using End = E;
+
+ using TupleTrait = std::true_type;
+ std::tuple<Begin, End> t;
+};
+
// ---
template <typename T, typename I, typename E>
@@ -1300,8 +1309,8 @@ using TupleClausesT =
DoacrossT<T, I, E>, DynGroupprivateT<T, I, E>, FromT<T, I, E>,
GrainsizeT<T, I, E>, IfT<T, I, E>, InitT<T, I, E>,
InReductionT<T, I, E>, LastprivateT<T, I, E>, LinearT<T, I, E>,
- MapT<T, I, E>, NumTasksT<T, I, E>, OrderT<T, I, E>,
- ReductionT<T, I, E>, ScheduleT<T, I, E>,
+ LoopRangeT<T, I, E>, MapT<T, I, E>, NumTasksT<T, I, E>,
+ OrderT<T, I, E>, ReductionT<T, I, E>, ScheduleT<T, I, E>,
TaskReductionT<T, I, E>, ToT<T, I, E>>;
template <typename T, typename I, typename E>
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index ffefa26..38f95a1 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -284,6 +284,10 @@ def OMPC_Linear : Clause<[Spelling<"linear">]> {
def OMPC_Link : Clause<[Spelling<"link">]> {
let flangClass = "OmpObjectList";
}
+def OMPC_LoopRange : Clause<[Spelling<"looprange">]> {
+ let clangClass = "OMPLoopRangeClause";
+ let flangClass = "OmpLoopRangeClause";
+}
def OMPC_Map : Clause<[Spelling<"map">]> {
let clangClass = "OMPMapClause";
let flangClass = "OmpMapClause";
@@ -902,6 +906,11 @@ def OMP_Groupprivate : Directive<[Spelling<"groupprivate">]> {
let category = CA_Declarative;
let languages = [L_C, L_Fortran];
}
+def OMP_Fuse : Directive<[Spelling<"fuse">]> {
+ let allowedOnceClauses = [VersionedClause<OMPC_LoopRange, 60>];
+ let association = AS_Block;
+ let category = CA_Executable;
+}
def OMP_Interchange : Directive<[Spelling<"interchange">]> {
let allowedOnceClauses = [
VersionedClause<OMPC_Permutation>,
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 3c4ed94..96da698 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -960,8 +960,12 @@ def int_instrprof_mcdc_tvbitmap_update : Intrinsic<[],
[llvm_ptr_ty, llvm_i64_ty,
llvm_i32_ty, llvm_ptr_ty]>;
-def int_call_preallocated_setup : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty]>;
-def int_call_preallocated_arg : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty]>;
+def int_call_preallocated_setup
+ : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty],
+ [ImmArg<ArgIndex<0>>]>;
+def int_call_preallocated_arg
+ : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty],
+ [ImmArg<ArgIndex<1>>]>;
def int_call_preallocated_teardown : DefaultAttrsIntrinsic<[], [llvm_token_ty]>;
// This intrinsic is intentionally undocumented and users shouldn't call it;
diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h
index de9675f..e97160e 100644
--- a/llvm/include/llvm/IR/ProfDataUtils.h
+++ b/llvm/include/llvm/IR/ProfDataUtils.h
@@ -185,6 +185,14 @@ inline uint32_t scaleBranchCount(uint64_t Count, uint64_t Scale) {
LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I,
StringRef PassName);
+/// Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch
+/// weights in the new instruction if the parent function of the original
+/// instruction has an entry count. This avoids confusing users by injecting
+/// profile data into non-profiled functions.
+LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I,
+ Function &F,
+ StringRef PassName);
+
/// Analogous to setExplicitlyUnknownBranchWeights, but for functions and their
/// entry counts.
LLVM_ABI void setExplicitlyUnknownFunctionEntryCount(Function &F,
diff --git a/llvm/include/llvm/Object/OffloadBundle.h b/llvm/include/llvm/Object/OffloadBundle.h
index f4d5a1d..18be62b 100644
--- a/llvm/include/llvm/Object/OffloadBundle.h
+++ b/llvm/include/llvm/Object/OffloadBundle.h
@@ -161,7 +161,7 @@ public:
OffsetStr.getAsInteger(10, O);
Str = Str.drop_front(OffsetStr.size());
- if (Str.consume_front("&size="))
+ if (!Str.consume_front("&size="))
return createStringError(object_error::parse_failed,
"Reading 'size' in URI");
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index 7d1a85b..e099581 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -1215,19 +1215,19 @@ namespace accessors {
/// Return the structural hash associated with the function.
template <class FuncRecordTy, llvm::endianness Endian>
uint64_t getFuncHash(const FuncRecordTy *Record) {
- return support::endian::byte_swap<uint64_t, Endian>(Record->FuncHash);
+ return support::endian::byte_swap<uint64_t>(Record->FuncHash, Endian);
}
/// Return the coverage map data size for the function.
template <class FuncRecordTy, llvm::endianness Endian>
uint64_t getDataSize(const FuncRecordTy *Record) {
- return support::endian::byte_swap<uint32_t, Endian>(Record->DataSize);
+ return support::endian::byte_swap<uint32_t>(Record->DataSize, Endian);
}
/// Return the function lookup key. The value is considered opaque.
template <class FuncRecordTy, llvm::endianness Endian>
uint64_t getFuncNameRef(const FuncRecordTy *Record) {
- return support::endian::byte_swap<uint64_t, Endian>(Record->NameRef);
+ return support::endian::byte_swap<uint64_t>(Record->NameRef, Endian);
}
/// Return the PGO name of the function. Used for formats in which the name is
@@ -1280,14 +1280,14 @@ struct CovMapFunctionRecordV1 {
/// Return function lookup key. The value is consider opaque.
template <llvm::endianness Endian> IntPtrT getFuncNameRef() const {
- return support::endian::byte_swap<IntPtrT, Endian>(NamePtr);
+ return support::endian::byte_swap<IntPtrT>(NamePtr, Endian);
}
/// Return the PGO name of the function.
template <llvm::endianness Endian>
Error getFuncName(InstrProfSymtab &ProfileNames, StringRef &FuncName) const {
IntPtrT NameRef = getFuncNameRef<Endian>();
- uint32_t NameS = support::endian::byte_swap<uint32_t, Endian>(NameSize);
+ uint32_t NameS = support::endian::byte_swap<uint32_t>(NameSize, Endian);
FuncName = ProfileNames.getFuncName(NameRef, NameS);
if (NameS && FuncName.empty())
return make_error<CoverageMapError>(coveragemap_error::malformed,
@@ -1385,7 +1385,7 @@ struct CovMapFunctionRecordV3 {
/// Get the filename set reference.
template <llvm::endianness Endian> uint64_t getFilenamesRef() const {
- return support::endian::byte_swap<uint64_t, Endian>(FilenamesRef);
+ return support::endian::byte_swap<uint64_t>(FilenamesRef, Endian);
}
/// Read the inline coverage mapping. Ignore the buffer parameter, it is for
@@ -1416,19 +1416,19 @@ struct CovMapHeader {
#define COVMAP_HEADER(Type, LLVMType, Name, Init) Type Name;
#include "llvm/ProfileData/InstrProfData.inc"
template <llvm::endianness Endian> uint32_t getNRecords() const {
- return support::endian::byte_swap<uint32_t, Endian>(NRecords);
+ return support::endian::byte_swap<uint32_t>(NRecords, Endian);
}
template <llvm::endianness Endian> uint32_t getFilenamesSize() const {
- return support::endian::byte_swap<uint32_t, Endian>(FilenamesSize);
+ return support::endian::byte_swap<uint32_t>(FilenamesSize, Endian);
}
template <llvm::endianness Endian> uint32_t getCoverageSize() const {
- return support::endian::byte_swap<uint32_t, Endian>(CoverageSize);
+ return support::endian::byte_swap<uint32_t>(CoverageSize, Endian);
}
template <llvm::endianness Endian> uint32_t getVersion() const {
- return support::endian::byte_swap<uint32_t, Endian>(Version);
+ return support::endian::byte_swap<uint32_t>(Version, Endian);
}
};
diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h
index 6c86feb..51db225 100644
--- a/llvm/include/llvm/Support/Endian.h
+++ b/llvm/include/llvm/Support/Endian.h
@@ -49,7 +49,9 @@ template <typename value_type>
/// Swap the bytes of value to match the given endianness.
template <typename value_type, endianness endian>
-[[nodiscard]] inline value_type byte_swap(value_type value) {
+[[nodiscard]]
+LLVM_DEPRECATED("Pass endian as a function argument instead",
+ "byte_swap") inline value_type byte_swap(value_type value) {
return byte_swap(value, endian);
}
@@ -137,8 +139,8 @@ template <typename value_type, endianness endian, std::size_t alignment>
LLVM_ASSUME_ALIGNED(
memory, (detail::PickAlignment<value_type, alignment>::value)),
sizeof(value_type) * 2);
- val[0] = byte_swap<value_type, endian>(val[0]);
- val[1] = byte_swap<value_type, endian>(val[1]);
+ val[0] = byte_swap<value_type>(val[0], endian);
+ val[1] = byte_swap<value_type>(val[1], endian);
// Shift bits from the lower value into place.
make_unsigned_t<value_type> lowerVal = val[0] >> startBit;
@@ -172,8 +174,8 @@ inline void writeAtBitAlignment(void *memory, value_type value,
LLVM_ASSUME_ALIGNED(
memory, (detail::PickAlignment<value_type, alignment>::value)),
sizeof(value_type) * 2);
- val[0] = byte_swap<value_type, endian>(val[0]);
- val[1] = byte_swap<value_type, endian>(val[1]);
+ val[0] = byte_swap<value_type>(val[0], endian);
+ val[1] = byte_swap<value_type>(val[1], endian);
// Mask off any existing bits in the upper part of the lower value that
// we want to replace.
@@ -201,8 +203,8 @@ inline void writeAtBitAlignment(void *memory, value_type value,
val[1] |= upperVal;
// Finally, rewrite values.
- val[0] = byte_swap<value_type, endian>(val[0]);
- val[1] = byte_swap<value_type, endian>(val[1]);
+ val[0] = byte_swap<value_type>(val[0], endian);
+ val[1] = byte_swap<value_type>(val[1], endian);
memcpy(LLVM_ASSUME_ALIGNED(
memory, (detail::PickAlignment<value_type, alignment>::value)),
&val[0], sizeof(value_type) * 2);
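
The mechanical migration in this file (and in CoverageMapping.h above) moves
the endianness from a template parameter to a function argument; a
before/after sketch (function name is illustrative):

  #include "llvm/Support/Endian.h"

  uint32_t toLittle(uint32_t V) {
    // Old, now deprecated:
    //   support::endian::byte_swap<uint32_t, endianness::little>(V)
    // New spelling, with the endianness passed at the call site:
    return llvm::support::endian::byte_swap<uint32_t>(
        V, llvm::endianness::little);
  }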
diff --git a/llvm/include/llvm/Support/FileCollector.h b/llvm/include/llvm/Support/FileCollector.h
index 9cc6776..9fa11ba 100644
--- a/llvm/include/llvm/Support/FileCollector.h
+++ b/llvm/include/llvm/Support/FileCollector.h
@@ -81,6 +81,9 @@ public:
/// Canonicalize a pair of virtual and real paths.
LLVM_ABI PathStorage canonicalize(StringRef SrcPath);
+ /// Return the underlying file system.
+ vfs::FileSystem &getFileSystem() const { return *VFS; }
+
explicit PathCanonicalizer(IntrusiveRefCntPtr<vfs::FileSystem> VFS)
: VFS(std::move(VFS)) {}
diff --git a/llvm/include/llvm/Support/SipHash.h b/llvm/include/llvm/Support/SipHash.h
index 910cf594..b090565 100644
--- a/llvm/include/llvm/Support/SipHash.h
+++ b/llvm/include/llvm/Support/SipHash.h
@@ -33,6 +33,13 @@ LLVM_ABI void getSipHash_2_4_64(ArrayRef<uint8_t> In, const uint8_t (&K)[16],
LLVM_ABI void getSipHash_2_4_128(ArrayRef<uint8_t> In, const uint8_t (&K)[16],
uint8_t (&Out)[16]);
+/// Compute a stable 64-bit hash of the given string.
+///
+/// The exact algorithm is the little-endian interpretation of the
+/// non-doubled (i.e. 64-bit) result of applying a SipHash-2-4 using
+/// a specific seed value which can be found in the source.
+LLVM_ABI uint64_t getStableSipHash(StringRef Str);
+
/// Compute a stable non-zero 16-bit hash of the given string.
///
/// The exact algorithm is the little-endian interpretation of the
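
A one-line usage sketch of the new helper (input string is illustrative):

  #include "llvm/Support/SipHash.h"

  uint64_t stableHashExample() {
    // Deterministic across runs and hosts, per the comment above.
    return llvm::getStableSipHash("example-string");
  }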
diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h
index d7211a9..3eb7c0b 100644
--- a/llvm/include/llvm/Support/TrailingObjects.h
+++ b/llvm/include/llvm/Support/TrailingObjects.h
@@ -57,25 +57,9 @@
namespace llvm {
namespace trailing_objects_internal {
-/// Helper template to calculate the max alignment requirement for a set of
-/// objects.
-template <typename First, typename... Rest> class AlignmentCalcHelper {
-private:
- enum {
- FirstAlignment = alignof(First),
- RestAlignment = AlignmentCalcHelper<Rest...>::Alignment,
- };
-public:
- enum {
- Alignment = FirstAlignment > RestAlignment ? FirstAlignment : RestAlignment
- };
-};
-
-template <typename First> class AlignmentCalcHelper<First> {
-public:
- enum { Alignment = alignof(First) };
-};
+template <typename... T>
+inline constexpr size_t MaxAlignment = std::max({alignof(T)...});
/// The base class for TrailingObjects* classes.
class TrailingObjectsBase {
@@ -209,11 +193,10 @@ protected:
/// See the file comment for details on the usage of the
/// TrailingObjects type.
template <typename BaseTy, typename... TrailingTys>
-class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl<
- trailing_objects_internal::AlignmentCalcHelper<
- TrailingTys...>::Alignment,
- BaseTy, TrailingObjects<BaseTy, TrailingTys...>,
- BaseTy, TrailingTys...> {
+class TrailingObjects
+ : private trailing_objects_internal::TrailingObjectsImpl<
+ trailing_objects_internal::MaxAlignment<TrailingTys...>, BaseTy,
+ TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...> {
template <int A, typename B, typename T, typename P, typename... M>
friend class trailing_objects_internal::TrailingObjectsImpl;
@@ -221,8 +204,8 @@ class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl<
template <typename... Tys> class Foo {};
typedef trailing_objects_internal::TrailingObjectsImpl<
- trailing_objects_internal::AlignmentCalcHelper<TrailingTys...>::Alignment,
- BaseTy, TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...>
+ trailing_objects_internal::MaxAlignment<TrailingTys...>, BaseTy,
+ TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...>
ParentType;
using TrailingObjectsBase = trailing_objects_internal::TrailingObjectsBase;
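
The variable template above folds ``std::max`` over the pack's alignments; a
compile-time check of that behavior under common alignments:

  #include "llvm/Support/TrailingObjects.h"

  static_assert(
      llvm::trailing_objects_internal::MaxAlignment<char, short, int> ==
          alignof(int),
      "MaxAlignment yields the largest alignof in the pack");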
diff --git a/llvm/include/llvm/TextAPI/SymbolSet.h b/llvm/include/llvm/TextAPI/SymbolSet.h
index 42c411a..22f4124 100644
--- a/llvm/include/llvm/TextAPI/SymbolSet.h
+++ b/llvm/include/llvm/TextAPI/SymbolSet.h
@@ -92,6 +92,8 @@ private:
public:
SymbolSet() = default;
+ SymbolSet(const SymbolSet &other) = delete;
+ SymbolSet &operator=(const SymbolSet &other) = delete;
LLVM_ABI ~SymbolSet();
LLVM_ABI Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags,
const Target &Targ);
diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
index fa313f5..d6c2d7f 100644
--- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
+++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h
@@ -64,6 +64,8 @@ protected:
/// A worklist of the instructions that need to be simplified.
InstructionWorklist &Worklist;
+ Function &F;
+
// Mode in which we are running the combiner.
const bool MinimizeSize;
@@ -98,17 +100,17 @@ protected:
bool ComputedBackEdges = false;
public:
- InstCombiner(InstructionWorklist &Worklist, BuilderTy &Builder,
- bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
- TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
- DominatorTree &DT, OptimizationRemarkEmitter &ORE,
- BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI,
- ProfileSummaryInfo *PSI, const DataLayout &DL,
+ InstCombiner(InstructionWorklist &Worklist, BuilderTy &Builder, Function &F,
+ AAResults *AA, AssumptionCache &AC, TargetLibraryInfo &TLI,
+ TargetTransformInfo &TTI, DominatorTree &DT,
+ OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI, ProfileSummaryInfo *PSI,
+ const DataLayout &DL,
ReversePostOrderTraversal<BasicBlock *> &RPOT)
: TTIForTargetIntrinsicsOnly(TTI), Builder(Builder), Worklist(Worklist),
- MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL),
- SQ(DL, &TLI, &DT, &AC, nullptr, /*UseInstrInfo*/ true,
- /*CanUseUndef*/ true, &DC),
+ F(F), MinimizeSize(F.hasMinSize()), AA(AA), AC(AC), TLI(TLI), DT(DT),
+ DL(DL), SQ(DL, &TLI, &DT, &AC, nullptr, /*UseInstrInfo*/ true,
+ /*CanUseUndef*/ true, &DC),
ORE(ORE), BFI(BFI), BPI(BPI), PSI(PSI), RPOT(RPOT) {}
virtual ~InstCombiner() = default;
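The constructor change follows a simple pattern: derive per-function flags from the Function itself instead of threading a separately computed bool through every caller. A reduced sketch of that shape (illustrative only, not the full InstCombiner interface):

#include "llvm/IR/Function.h"

namespace sketch {
struct Combiner {
  llvm::Function &F;
  const bool MinimizeSize; // Derived from F, no longer a constructor argument.
  explicit Combiner(llvm::Function &F) : F(F), MinimizeSize(F.hasMinSize()) {}
};
} // namespace sketch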
diff --git a/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h
index 4d3ead29..f53cee2 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h
@@ -20,11 +20,15 @@ namespace llvm {
/// The gcov-style instrumentation pass
class GCOVProfilerPass : public PassInfoMixin<GCOVProfilerPass> {
public:
- GCOVProfilerPass(const GCOVOptions &Options = GCOVOptions::getDefault()) : GCOVOpts(Options) { }
+ GCOVProfilerPass(
+ const GCOVOptions &Options = GCOVOptions::getDefault(),
+ IntrusiveRefCntPtr<vfs::FileSystem> VFS = vfs::getRealFileSystem())
+ : GCOVOpts(Options), VFS(std::move(VFS)) {}
LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
private:
GCOVOptions GCOVOpts;
+ IntrusiveRefCntPtr<vfs::FileSystem> VFS;
};
} // namespace llvm
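A hedged usage sketch of the new VFS parameter, e.g. injecting an in-memory filesystem in a unit test; the test scaffolding is an assumption, only the constructor signature comes from the patch:

#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"

void buildPassWithTestFS() {
  // Hypothetical test setup: the pass's file accesses go through this VFS.
  auto FS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
  llvm::GCOVProfilerPass P(llvm::GCOVOptions::getDefault(), FS);
  (void)P;
}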
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 90bae77..6fb2807 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -59,6 +59,11 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(LazyValueInfoWrapperPass, "lazy-value-info",
"Lazy Value Information Analysis", false, true)
+static cl::opt<bool> PerPredRanges(
+ "lvi-per-pred-ranges", cl::Hidden, cl::init(false),
+ cl::desc("Enable tracking of ranges for a value in a block for"
+ "each block predecessor (default = false)"));
+
namespace llvm {
FunctionPass *createLazyValueInfoPass() {
return new LazyValueInfoWrapperPass();
@@ -103,6 +108,10 @@ namespace {
namespace {
using NonNullPointerSet = SmallDenseSet<AssertingVH<Value>, 2>;
+using BBLatticeElementMap =
+ SmallDenseMap<PoisoningVH<BasicBlock>, ValueLatticeElement, 4>;
+using PredecessorValueLatticeMap =
+ SmallDenseMap<AssertingVH<Value>, BBLatticeElementMap, 2>;
/// This is the cache kept by LazyValueInfo which
/// maintains information about queries across the clients' queries.
@@ -117,6 +126,10 @@ class LazyValueInfoCache {
// std::nullopt indicates that the nonnull pointers for this basic block
// have not been computed yet.
std::optional<NonNullPointerSet> NonNullPointers;
+  // This extends the LatticeElements above by caching, for each Value, a
+  // ValueLatticeElement for each predecessor of the BB tracked by this
+  // entry.
+ std::optional<PredecessorValueLatticeMap> PredecessorLatticeElements;
};
/// Cached information per basic block.
@@ -134,8 +147,14 @@ class LazyValueInfoCache {
BlockCacheEntry *getOrCreateBlockEntry(BasicBlock *BB) {
auto It = BlockCache.find_as(BB);
- if (It == BlockCache.end())
- It = BlockCache.insert({BB, std::make_unique<BlockCacheEntry>()}).first;
+ if (It == BlockCache.end()) {
+ std::unique_ptr<BlockCacheEntry> BCE =
+ std::make_unique<BlockCacheEntry>();
+ if (PerPredRanges)
+ BCE->PredecessorLatticeElements =
+ std::make_optional<PredecessorValueLatticeMap>();
+ It = BlockCache.insert({BB, std::move(BCE)}).first;
+ }
return It->second.get();
}
@@ -161,6 +180,28 @@ public:
addValueHandle(Val);
}
+ void insertPredecessorResults(Value *Val, BasicBlock *BB,
+ BBLatticeElementMap &PredLatticeElements) {
+ BlockCacheEntry *Entry = getOrCreateBlockEntry(BB);
+
+ Entry->PredecessorLatticeElements->insert({Val, PredLatticeElements});
+
+ addValueHandle(Val);
+ }
+
+ std::optional<BBLatticeElementMap>
+ getCachedPredecessorInfo(Value *V, BasicBlock *BB) const {
+ const BlockCacheEntry *Entry = getBlockEntry(BB);
+ if (!Entry)
+ return std::nullopt;
+
+ auto LatticeIt = Entry->PredecessorLatticeElements->find_as(V);
+ if (LatticeIt == Entry->PredecessorLatticeElements->end())
+ return std::nullopt;
+
+ return LatticeIt->second;
+ }
+
std::optional<ValueLatticeElement> getCachedValueInfo(Value *V,
BasicBlock *BB) const {
const BlockCacheEntry *Entry = getBlockEntry(BB);
@@ -216,6 +257,8 @@ void LazyValueInfoCache::eraseValue(Value *V) {
Pair.second->OverDefined.erase(V);
if (Pair.second->NonNullPointers)
Pair.second->NonNullPointers->erase(V);
+ if (PerPredRanges)
+ Pair.second->PredecessorLatticeElements->erase(V);
}
auto HandleIt = ValueHandles.find_as(V);
@@ -230,6 +273,10 @@ void LVIValueHandle::deleted() {
}
void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
+  // Clear all cached predecessor lattice elements when a BB is removed.
+ if (PerPredRanges)
+ for (auto &Pair : BlockCache)
+ Pair.second->PredecessorLatticeElements->clear();
BlockCache.erase(BB);
}
@@ -691,6 +738,9 @@ LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) {
// find a path to function entry. TODO: We should consider explicitly
// canonicalizing to make this true rather than relying on this happy
// accident.
+ std::optional<BBLatticeElementMap> PredLatticeElements;
+ if (PerPredRanges)
+ PredLatticeElements = std::make_optional<BBLatticeElementMap>();
for (BasicBlock *Pred : predecessors(BB)) {
// Skip self loops.
if (Pred == BB)
@@ -710,8 +760,13 @@ LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) {
<< Pred->getName() << "' (non local).\n");
return Result;
}
+ if (PerPredRanges)
+ PredLatticeElements->insert({Pred, *EdgeResult});
}
+ if (PerPredRanges)
+ TheCache.insertPredecessorResults(Val, BB, *PredLatticeElements);
+
// Return the merged value, which is more precise than 'overdefined'.
assert(!Result.isOverdefined());
return Result;
@@ -724,6 +779,9 @@ LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) {
// Loop over all of our predecessors, merging what we know from them into
// result. See the comment about the chosen traversal order in
// solveBlockValueNonLocal; the same reasoning applies here.
+ std::optional<BBLatticeElementMap> PredLatticeElements;
+ if (PerPredRanges)
+ PredLatticeElements = std::make_optional<BBLatticeElementMap>();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PhiBB = PN->getIncomingBlock(i);
Value *PhiVal = PN->getIncomingValue(i);
@@ -746,8 +804,14 @@ LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) {
return Result;
}
+
+ if (PerPredRanges)
+ PredLatticeElements->insert({PhiBB, *EdgeResult});
}
+ if (PerPredRanges)
+ TheCache.insertPredecessorResults(PN, BB, *PredLatticeElements);
+
// Return the merged value, which is more precise than 'overdefined'.
assert(!Result.isOverdefined() && "Possible PHI in entry block?");
return Result;
@@ -1002,7 +1066,77 @@ LazyValueInfoImpl::solveBlockValueBinaryOpImpl(
const ConstantRange &LHSRange = *LHSRes;
const ConstantRange &RHSRange = *RHSRes;
- return ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange));
+
+ std::optional<ValueLatticeElement> MergedResult =
+ ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange));
+
+ if (!PerPredRanges)
+ return MergedResult;
+
+ std::optional<BBLatticeElementMap> PredLHS =
+ TheCache.getCachedPredecessorInfo(LHS, BB);
+ if (!PredLHS)
+ return MergedResult;
+ std::optional<BBLatticeElementMap> PredRHS =
+ TheCache.getCachedPredecessorInfo(RHS, BB);
+ if (!PredRHS)
+ return MergedResult;
+
+ const BBLatticeElementMap &LHSPredMap = *PredLHS;
+ const BBLatticeElementMap &RHSPredMap = *PredRHS;
+
+ BBLatticeElementMap PredLatticeElements;
+ ValueLatticeElement OverallPredResult;
+ for (auto *Pred : predecessors(BB)) {
+ auto LHSIt = LHSPredMap.find_as(Pred);
+ if (LHSIt == LHSPredMap.end())
+ return MergedResult;
+ const ValueLatticeElement &LHSFromPred = LHSIt->second;
+ std::optional<ConstantRange> LHSFromPredRes =
+ LHSFromPred.asConstantRange(LHS->getType());
+ if (!LHSFromPredRes)
+ return MergedResult;
+
+ auto RHSIt = RHSPredMap.find_as(Pred);
+ if (RHSIt == RHSPredMap.end())
+ return MergedResult;
+ const ValueLatticeElement &RHSFromPred = RHSIt->second;
+ std::optional<ConstantRange> RHSFromPredRes =
+ RHSFromPred.asConstantRange(RHS->getType());
+ if (!RHSFromPredRes)
+ return MergedResult;
+
+ const ConstantRange &LHSFromPredRange = *LHSFromPredRes;
+ const ConstantRange &RHSFromPredRange = *RHSFromPredRes;
+ std::optional<ValueLatticeElement> PredResult =
+ ValueLatticeElement::getRange(OpFn(LHSFromPredRange, RHSFromPredRange));
+ if (!PredResult)
+ return MergedResult;
+ if (PredResult->isOverdefined()) {
+ LLVM_DEBUG(
+ dbgs() << " pred BB '" << Pred->getName() << "' for BB '"
+ << BB->getName()
+ << "' overdefined. Discarding all predecessor intervals.\n");
+ return MergedResult;
+ }
+ PredLatticeElements.insert({Pred, *PredResult});
+ OverallPredResult.mergeIn(*PredResult);
+ }
+
+  // If this point is reached, all predecessors of both LHS and RHS have
+  // previously computed constant ranges. Cache the result and use the
+  // OverallPredResult.
+ TheCache.insertPredecessorResults(I, BB, PredLatticeElements);
+
+ LLVM_DEBUG(dbgs() << " Using predecessor intervals, evaluated " << *I
+ << " to: " << OverallPredResult << ".\n");
+
+ if (!MergedResult)
+ return OverallPredResult;
+
+ LLVM_DEBUG(dbgs() << " Intersecting intervals for " << *I << ": "
+                    << OverallPredResult << " and " << *MergedResult << ".\n");
+ return MergedResult->intersect(OverallPredResult);
}
std::optional<ValueLatticeElement>
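To see why caching per-predecessor ranges can pay off, consider a small sketch with llvm::ConstantRange (values made up; 8-bit for brevity): merging operand ranges before applying an operation can be strictly looser than applying the operation per edge and merging the results, which is what the intersect above recovers.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
using llvm::APInt;
using llvm::ConstantRange;

void perPredSketch() {
  ConstantRange LHSPred1(APInt(8, 0)); // LHS is {0} on edge 1
  ConstantRange LHSPred2(APInt(8, 8)); // LHS is {8} on edge 2
  ConstantRange Seven(APInt(8, 7));    // RHS is the constant 7
  // Merge first: the hull of {0,8} is [0,9), and [0,9) & 7 yields [0,8).
  ConstantRange MergedFirst = LHSPred1.unionWith(LHSPred2).binaryAnd(Seven);
  // Per edge: {0}&7 = {0} and {8}&7 = {0}, so the merged result is exactly {0}.
  ConstantRange PerPred =
      LHSPred1.binaryAnd(Seven).unionWith(LHSPred2.binaryAnd(Seven));
  (void)MergedFirst;
  (void)PerPred;
}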
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index 6ed724b..7ae5f7e 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -2,14 +2,19 @@ add_llvm_component_library(LLVMCAS
ActionCache.cpp
ActionCaches.cpp
BuiltinCAS.cpp
+ DatabaseFile.cpp
InMemoryCAS.cpp
MappedFileRegionArena.cpp
ObjectStore.cpp
OnDiskCommon.cpp
+ OnDiskTrieRawHashMap.cpp
ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS
+ LINK_LIBS
+ ${LLVM_PTHREAD_LIB}
+
LINK_COMPONENTS
Support
)
diff --git a/llvm/lib/CAS/DatabaseFile.cpp b/llvm/lib/CAS/DatabaseFile.cpp
new file mode 100644
index 0000000..db8ce1d
--- /dev/null
+++ b/llvm/lib/CAS/DatabaseFile.cpp
@@ -0,0 +1,123 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file implements the common abstractions for CAS database files.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DatabaseFile.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+Error ondisk::createTableConfigError(std::errc ErrC, StringRef Path,
+ StringRef TableName, const Twine &Msg) {
+ return createStringError(make_error_code(ErrC),
+ Path + "[" + TableName + "]: " + Msg);
+}
+
+Error ondisk::checkTable(StringRef Label, size_t Expected, size_t Observed,
+ StringRef Path, StringRef TrieName) {
+ if (Expected == Observed)
+ return Error::success();
+ return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+ "mismatched " + Label +
+ " (expected: " + Twine(Expected) +
+ ", observed: " + Twine(Observed) + ")");
+}
+
+Expected<DatabaseFile>
+DatabaseFile::create(const Twine &Path, uint64_t Capacity,
+ function_ref<Error(DatabaseFile &)> NewDBConstructor) {
+  // Constructor used when the file doesn't exist.
+ auto NewFileConstructor = [&](MappedFileRegionArena &Alloc) -> Error {
+ if (Alloc.capacity() <
+ sizeof(Header) + sizeof(MappedFileRegionArena::Header))
+ return createTableConfigError(std::errc::argument_out_of_domain,
+ Path.str(), "datafile",
+ "Allocator too small for header");
+ (void)new (Alloc.data()) Header{getMagic(), getVersion(), {0}};
+ DatabaseFile DB(Alloc);
+ return NewDBConstructor(DB);
+ };
+
+ // Get or create the file.
+ MappedFileRegionArena Alloc;
+ if (Error E = MappedFileRegionArena::create(Path, Capacity, sizeof(Header),
+ NewFileConstructor)
+ .moveInto(Alloc))
+ return std::move(E);
+
+ return DatabaseFile::get(
+ std::make_unique<MappedFileRegionArena>(std::move(Alloc)));
+}
+
+Error DatabaseFile::addTable(TableHandle Table) {
+ assert(Table);
+ assert(&Table.getRegion() == &getRegion());
+ int64_t ExistingRootOffset = 0;
+ const int64_t NewOffset =
+ reinterpret_cast<const char *>(&Table.getHeader()) - getRegion().data();
+ if (H->RootTableOffset.compare_exchange_strong(ExistingRootOffset, NewOffset))
+ return Error::success();
+
+ // Silently ignore attempts to set the root to itself.
+ if (ExistingRootOffset == NewOffset)
+ return Error::success();
+
+  // Return a proper error message.
+ TableHandle Root(getRegion(), ExistingRootOffset);
+ if (Root.getName() == Table.getName())
+ return createStringError(
+ make_error_code(std::errc::not_supported),
+ "collision with existing table of the same name '" + Table.getName() +
+ "'");
+
+ return createStringError(make_error_code(std::errc::not_supported),
+ "cannot add new table '" + Table.getName() +
+ "'"
+ " to existing root '" +
+ Root.getName() + "'");
+}
+
+std::optional<TableHandle> DatabaseFile::findTable(StringRef Name) {
+ int64_t RootTableOffset = H->RootTableOffset.load();
+ if (!RootTableOffset)
+ return std::nullopt;
+
+ TableHandle Root(getRegion(), RootTableOffset);
+ if (Root.getName() == Name)
+ return Root;
+
+ return std::nullopt;
+}
+
+Error DatabaseFile::validate(MappedFileRegion &Region) {
+ if (Region.size() < sizeof(Header))
+ return createStringError(std::errc::invalid_argument,
+ "database: missing header");
+
+ // Check the magic and version.
+ auto *H = reinterpret_cast<Header *>(Region.data());
+ if (H->Magic != getMagic())
+ return createStringError(std::errc::invalid_argument,
+ "database: bad magic");
+ if (H->Version != getVersion())
+ return createStringError(std::errc::invalid_argument,
+ "database: wrong version");
+
+ auto *MFH = reinterpret_cast<MappedFileRegionArena::Header *>(Region.data() +
+ sizeof(Header));
+ // Check the bump-ptr, which should point past the header.
+ if (MFH->BumpPtr.load() < (int64_t)sizeof(Header))
+ return createStringError(std::errc::invalid_argument,
+ "database: corrupt bump-ptr");
+
+ return Error::success();
+}
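The root-claiming step in addTable reduces to one compare-and-swap on a zero-initialized offset. A minimal standalone sketch with a simplified header and no error plumbing:

#include <atomic>
#include <cstdint>

struct FileHeader {
  uint32_t Magic;
  uint32_t Version;
  std::atomic<int64_t> RootTableOffset; // 0 means no root table yet.
};

// Returns true if NewOffset is (or already was) the root table offset.
bool claimRoot(FileHeader &H, int64_t NewOffset) {
  int64_t Existing = 0;
  if (H.RootTableOffset.compare_exchange_strong(Existing, NewOffset))
    return true;                // This call installed the root.
  return Existing == NewOffset; // Re-adding the same root is a no-op.
}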
diff --git a/llvm/lib/CAS/DatabaseFile.h b/llvm/lib/CAS/DatabaseFile.h
new file mode 100644
index 0000000..609e5f13
--- /dev/null
+++ b/llvm/lib/CAS/DatabaseFile.h
@@ -0,0 +1,153 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the common interface for a DatabaseFile that is used to
+/// implement OnDiskCAS.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_DATABASEFILE_H
+#define LLVM_LIB_CAS_DATABASEFILE_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas::ondisk {
+
+using MappedFileRegion = MappedFileRegionArena::RegionT;
+
+/// Generic handle for a table.
+///
+/// Generic table header layout:
+/// - 2-bytes: TableKind
+/// - 2-bytes: TableNameSize
+/// - 4-bytes: TableNameRelOffset (relative to header)
+class TableHandle {
+public:
+ enum class TableKind : uint16_t {
+ TrieRawHashMap = 1,
+ DataAllocator = 2,
+ };
+ struct Header {
+ TableKind Kind;
+ uint16_t NameSize;
+ int32_t NameRelOffset; ///< Relative to Header.
+ };
+
+ explicit operator bool() const { return H; }
+ const Header &getHeader() const { return *H; }
+ MappedFileRegion &getRegion() const { return *Region; }
+
+ template <class T> static void check() {
+ static_assert(
+ std::is_same<decltype(T::Header::GenericHeader), Header>::value,
+ "T::GenericHeader should be of type TableHandle::Header");
+ static_assert(offsetof(typename T::Header, GenericHeader) == 0,
+ "T::GenericHeader must be the head of T::Header");
+ }
+ template <class T> bool is() const { return T::Kind == H->Kind; }
+ template <class T> T dyn_cast() const {
+ check<T>();
+ if (is<T>())
+ return T(*Region, *reinterpret_cast<typename T::Header *>(H));
+ return T();
+ }
+ template <class T> T cast() const {
+ assert(is<T>());
+ return dyn_cast<T>();
+ }
+
+ StringRef getName() const {
+ auto *Begin = reinterpret_cast<const char *>(H) + H->NameRelOffset;
+ return StringRef(Begin, H->NameSize);
+ }
+
+ TableHandle() = default;
+ TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {}
+ TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+ : TableHandle(Region,
+ *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+ }
+
+private:
+ MappedFileRegion *Region = nullptr;
+ Header *H = nullptr;
+};
+
+/// Encapsulate a database file, which:
+/// - Sets/checks magic.
+/// - Sets/checks version.
+/// - Points at an arbitrary root table.
+/// - Sets up a MappedFileRegionArena for allocation.
+///
+/// Top-level layout:
+/// - 4-bytes: Magic
+/// - 4-bytes: Version
+/// - 8-bytes: RootTableOffset (16-bits: Kind; 48-bits: Offset)
+/// - 8-bytes: BumpPtr from MappedFileRegionArena
+class DatabaseFile {
+public:
+ static constexpr uint32_t getMagic() { return 0xDA7ABA53UL; }
+ static constexpr uint32_t getVersion() { return 1UL; }
+ struct Header {
+ uint32_t Magic;
+ uint32_t Version;
+ std::atomic<int64_t> RootTableOffset;
+ };
+
+ const Header &getHeader() { return *H; }
+ MappedFileRegionArena &getAlloc() { return Alloc; }
+ MappedFileRegion &getRegion() { return Alloc.getRegion(); }
+
+  /// Add a table. This is currently not thread-safe and should be called
+  /// inside NewDBConstructor.
+ Error addTable(TableHandle Table);
+
+  /// Find a table. Returns std::nullopt if no table is found.
+ std::optional<TableHandle> findTable(StringRef Name);
+
+ /// Create the DatabaseFile at Path with Capacity.
+ static Expected<DatabaseFile>
+ create(const Twine &Path, uint64_t Capacity,
+ function_ref<Error(DatabaseFile &)> NewDBConstructor);
+
+ size_t size() const { return Alloc.size(); }
+
+private:
+ static Expected<DatabaseFile>
+ get(std::unique_ptr<MappedFileRegionArena> Alloc) {
+ if (Error E = validate(Alloc->getRegion()))
+ return std::move(E);
+ return DatabaseFile(std::move(Alloc));
+ }
+
+ static Error validate(MappedFileRegion &Region);
+
+ DatabaseFile(MappedFileRegionArena &Alloc)
+ : H(reinterpret_cast<Header *>(Alloc.data())), Alloc(Alloc) {}
+ DatabaseFile(std::unique_ptr<MappedFileRegionArena> Alloc)
+ : DatabaseFile(*Alloc) {
+ OwnedAlloc = std::move(Alloc);
+ }
+
+ Header *H = nullptr;
+ MappedFileRegionArena &Alloc;
+ std::unique_ptr<MappedFileRegionArena> OwnedAlloc;
+};
+
+Error createTableConfigError(std::errc ErrC, StringRef Path,
+ StringRef TableName, const Twine &Msg);
+
+Error checkTable(StringRef Label, size_t Expected, size_t Observed,
+ StringRef Path, StringRef TrieName);
+
+} // namespace llvm::cas::ondisk
+
+#endif
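A hypothetical concrete table type, showing the layout contract that TableHandle::check<T>() enforces: the generic header must be the first member of T::Header so a handle can be classified by its Kind field alone.

struct MyDataTable {
  static constexpr llvm::cas::ondisk::TableHandle::TableKind Kind =
      llvm::cas::ondisk::TableHandle::TableKind::DataAllocator;
  struct Header {
    llvm::cas::ondisk::TableHandle::Header GenericHeader; // must be at offset 0
    uint64_t ExtraField; // table-specific state (hypothetical)
  };
};
// TableHandle::check<MyDataTable>() statically validates this layout.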
diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp
index 255b89c..c63ee70d 100644
--- a/llvm/lib/CAS/InMemoryCAS.cpp
+++ b/llvm/lib/CAS/InMemoryCAS.cpp
@@ -57,6 +57,9 @@ public:
InMemoryObject() = delete;
InMemoryObject(InMemoryObject &&) = delete;
InMemoryObject(const InMemoryObject &) = delete;
+ InMemoryObject &operator=(const InMemoryObject &) = delete;
+ InMemoryObject &operator=(InMemoryObject &&) = delete;
+ virtual ~InMemoryObject() = default;
protected:
InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {}
diff --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
new file mode 100644
index 0000000..9b382dd7
--- /dev/null
+++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
@@ -0,0 +1,1178 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file Implements OnDiskTrieRawHashMap.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "DatabaseFile.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/TrieHashIndexGenerator.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+#if LLVM_ENABLE_ONDISK_CAS
+
+//===----------------------------------------------------------------------===//
+// TrieRawHashMap data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class SubtrieHandle;
+class TrieRawHashMapHandle;
+class TrieVisitor;
+
+/// A value stored in the slots inside a SubTrie. A stored value can either be a
+/// subtrie (encoded after negation) which is the file offset to another
+/// subtrie, or it can be a file offset to a DataRecord.
+class SubtrieSlotValue {
+public:
+ explicit operator bool() const { return !isEmpty(); }
+ bool isEmpty() const { return !Offset; }
+ bool isData() const { return Offset > 0; }
+ bool isSubtrie() const { return Offset < 0; }
+ uint64_t asData() const {
+ assert(isData());
+ return Offset;
+ }
+ uint64_t asSubtrie() const {
+ assert(isSubtrie());
+ return -Offset;
+ }
+
+ FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); }
+
+ FileOffset asDataFileOffset() const { return FileOffset(asData()); }
+
+ int64_t getRawOffset() const { return Offset; }
+
+ static SubtrieSlotValue getDataOffset(int64_t Offset) {
+ return SubtrieSlotValue(Offset);
+ }
+
+ static SubtrieSlotValue getSubtrieOffset(int64_t Offset) {
+ return SubtrieSlotValue(-Offset);
+ }
+
+ static SubtrieSlotValue getDataOffset(FileOffset Offset) {
+ return getDataOffset(Offset.get());
+ }
+
+  static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) {
+    return getSubtrieOffset(Offset.get());
+  }
+
+ static SubtrieSlotValue getFromSlot(std::atomic<int64_t> &Slot) {
+ return SubtrieSlotValue(Slot.load());
+ }
+
+ SubtrieSlotValue() = default;
+
+private:
+ friend class SubtrieHandle;
+ explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {}
+ int64_t Offset = 0;
+};
+
+/// Subtrie layout:
+/// - 2-bytes: StartBit
+/// - 1-bytes: NumBits=lg(num-slots)
+/// - 5-bytes: 0-pad
+/// - <slots>
+class SubtrieHandle {
+public:
+ struct Header {
+ /// The bit this subtrie starts on.
+ uint16_t StartBit;
+
+ /// The number of bits this subtrie handles. It has 2^NumBits slots.
+ uint8_t NumBits;
+
+ /// 0-pad to 8B.
+ uint8_t ZeroPad1B;
+ uint32_t ZeroPad4B;
+ };
+
+ /// Slot storage:
+ /// - zero: Empty
+ /// - positive: RecordOffset
+ /// - negative: SubtrieOffset
+ using SlotT = std::atomic<int64_t>;
+
+ static int64_t getSlotsSize(uint32_t NumBits) {
+ return sizeof(int64_t) * (1u << NumBits);
+ }
+
+ static int64_t getSize(uint32_t NumBits) {
+ return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits);
+ }
+
+ int64_t getSize() const { return getSize(H->NumBits); }
+ size_t getNumSlots() const { return Slots.size(); }
+
+ SubtrieSlotValue load(size_t I) const {
+ return SubtrieSlotValue(Slots[I].load());
+ }
+ void store(size_t I, SubtrieSlotValue V) {
+ return Slots[I].store(V.getRawOffset());
+ }
+
+ void printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const;
+
+  /// Returns true on success. On failure, \p Expected is updated with the
+  /// existing value.
+ bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected,
+ SubtrieSlotValue New) {
+ return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset);
+ }
+
+ /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with
+ /// \p NumSubtrieBits.
+ ///
+  /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If a
+  /// new subtrie is created that isn't used because of a lost race, then it
+  /// should be returned as an out parameter to be passed back in the future.
+  /// If it's already valid, it should be used instead of allocating a new one.
+ ///
+ /// Returns the subtrie that now lives at \p I.
+ Expected<SubtrieHandle> sink(size_t I, SubtrieSlotValue V,
+ MappedFileRegionArena &Alloc,
+ size_t NumSubtrieBits,
+ SubtrieHandle &UnusedSubtrie, size_t NewI);
+
+ /// Only safe if the subtrie is empty.
+ void reinitialize(uint32_t StartBit, uint32_t NumBits);
+
+ SubtrieSlotValue getOffset() const {
+ return SubtrieSlotValue::getSubtrieOffset(
+ reinterpret_cast<const char *>(H) - Region->data());
+ }
+
+ FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); }
+
+ explicit operator bool() const { return H; }
+
+ Header &getHeader() const { return *H; }
+ uint32_t getStartBit() const { return H->StartBit; }
+ uint32_t getNumBits() const { return H->NumBits; }
+
+ static Expected<SubtrieHandle> create(MappedFileRegionArena &Alloc,
+ uint32_t StartBit, uint32_t NumBits);
+
+ static SubtrieHandle getFromFileOffset(MappedFileRegion &Region,
+ FileOffset Offset) {
+ return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset));
+ }
+
+ SubtrieHandle() = default;
+ SubtrieHandle(MappedFileRegion &Region, Header &H)
+ : Region(&Region), H(&H), Slots(getSlots(H)) {}
+ SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset)
+ : SubtrieHandle(Region, *reinterpret_cast<Header *>(
+ Region.data() + Offset.asSubtrie())) {}
+
+private:
+ MappedFileRegion *Region = nullptr;
+ Header *H = nullptr;
+ MutableArrayRef<SlotT> Slots;
+
+ static MutableArrayRef<SlotT> getSlots(Header &H) {
+ return MutableArrayRef(reinterpret_cast<SlotT *>(&H + 1), 1u << H.NumBits);
+ }
+};
+
+/// Handle for a TrieRawHashMap table.
+///
+/// TrieRawHashMap table layout:
+/// - [8-bytes: Generic table header]
+/// - 1-byte: NumSubtrieBits
+/// - 1-byte: Flags (not used yet)
+/// - 2-bytes: NumHashBits
+/// - 4-bytes: RecordDataSize (in bytes)
+/// - 8-bytes: RootTrieOffset
+/// - 8-bytes: AllocatorOffset (reserved for implementing free lists)
+/// - <name> '\0'
+///
+/// Record layout:
+/// - <hash>
+/// - <data>
+class TrieRawHashMapHandle {
+public:
+ static constexpr TableHandle::TableKind Kind =
+ TableHandle::TableKind::TrieRawHashMap;
+
+ struct Header {
+ TableHandle::Header GenericHeader;
+ uint8_t NumSubtrieBits;
+ uint8_t Flags; ///< None used yet.
+ uint16_t NumHashBits;
+ uint32_t RecordDataSize;
+ std::atomic<int64_t> RootTrieOffset;
+ std::atomic<int64_t> AllocatorOffset;
+ };
+
+ operator TableHandle() const {
+ if (!H)
+ return TableHandle();
+ return TableHandle(*Region, H->GenericHeader);
+ }
+
+ struct RecordData {
+ OnDiskTrieRawHashMap::ValueProxy Proxy;
+ SubtrieSlotValue Offset;
+ FileOffset getFileOffset() const { return Offset.asDataFileOffset(); }
+ };
+
+ enum Limits : size_t {
+ /// Seems like 65528 hash bits ought to be enough.
+ MaxNumHashBytes = UINT16_MAX >> 3,
+ MaxNumHashBits = MaxNumHashBytes << 3,
+
+ /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit
+    /// index. This many slots is suspiciously large anyway.
+ MaxNumRootBits = 16,
+
+ /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously
+ /// large for subtries.
+ MaxNumSubtrieBits = 10,
+ };
+
+ static constexpr size_t getNumHashBytes(size_t NumHashBits) {
+ assert(NumHashBits % 8 == 0);
+ return NumHashBits / 8;
+ }
+ static constexpr size_t getRecordSize(size_t RecordDataSize,
+ size_t NumHashBits) {
+ return RecordDataSize + getNumHashBytes(NumHashBits);
+ }
+
+ RecordData getRecord(SubtrieSlotValue Offset);
+ Expected<RecordData> createRecord(MappedFileRegionArena &Alloc,
+ ArrayRef<uint8_t> Hash);
+
+ explicit operator bool() const { return H; }
+ const Header &getHeader() const { return *H; }
+ SubtrieHandle getRoot() const;
+ Expected<SubtrieHandle> getOrCreateRoot(MappedFileRegionArena &Alloc);
+ MappedFileRegion &getRegion() const { return *Region; }
+
+ size_t getFlags() const { return H->Flags; }
+ size_t getNumSubtrieBits() const { return H->NumSubtrieBits; }
+ size_t getNumHashBits() const { return H->NumHashBits; }
+ size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); }
+ size_t getRecordDataSize() const { return H->RecordDataSize; }
+ size_t getRecordSize() const {
+ return getRecordSize(H->RecordDataSize, H->NumHashBits);
+ }
+
+ TrieHashIndexGenerator getIndexGen(SubtrieHandle Root,
+ ArrayRef<uint8_t> Hash) {
+ assert(Root.getStartBit() == 0);
+ assert(getNumHashBytes() == Hash.size());
+ assert(getNumHashBits() == Hash.size() * 8);
+ return TrieHashIndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash};
+ }
+
+ static Expected<TrieRawHashMapHandle>
+ create(MappedFileRegionArena &Alloc, StringRef Name,
+ std::optional<uint64_t> NumRootBits, uint64_t NumSubtrieBits,
+ uint64_t NumHashBits, uint64_t RecordDataSize);
+
+ void
+ print(raw_ostream &OS,
+ function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+
+ Error validate(
+ function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+ RecordVerifier) const;
+ TrieRawHashMapHandle() = default;
+ TrieRawHashMapHandle(MappedFileRegion &Region, Header &H)
+ : Region(&Region), H(&H) {}
+ TrieRawHashMapHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+ : TrieRawHashMapHandle(
+ Region, *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
+ }
+
+private:
+ MappedFileRegion *Region = nullptr;
+ Header *H = nullptr;
+};
+
+} // end anonymous namespace
+
+struct OnDiskTrieRawHashMap::ImplType {
+ DatabaseFile File;
+ TrieRawHashMapHandle Trie;
+};
+
+Expected<SubtrieHandle> SubtrieHandle::create(MappedFileRegionArena &Alloc,
+ uint32_t StartBit,
+ uint32_t NumBits) {
+ assert(StartBit <= TrieRawHashMapHandle::MaxNumHashBits);
+ assert(NumBits <= UINT8_MAX);
+ assert(NumBits <= TrieRawHashMapHandle::MaxNumRootBits);
+
+ auto Mem = Alloc.allocate(getSize(NumBits));
+ if (LLVM_UNLIKELY(!Mem))
+ return Mem.takeError();
+ auto *H =
+ new (*Mem) SubtrieHandle::Header{(uint16_t)StartBit, (uint8_t)NumBits,
+ /*ZeroPad1B=*/0, /*ZeroPad4B=*/0};
+ SubtrieHandle S(Alloc.getRegion(), *H);
+ for (auto I = S.Slots.begin(), E = S.Slots.end(); I != E; ++I)
+ new (I) SlotT(0);
+ return S;
+}
+
+SubtrieHandle TrieRawHashMapHandle::getRoot() const {
+ if (int64_t Root = H->RootTrieOffset)
+ return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Root));
+ return SubtrieHandle();
+}
+
+Expected<SubtrieHandle>
+TrieRawHashMapHandle::getOrCreateRoot(MappedFileRegionArena &Alloc) {
+ assert(&Alloc.getRegion() == &getRegion());
+ if (SubtrieHandle Root = getRoot())
+ return Root;
+
+ int64_t Race = 0;
+ auto LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits);
+ if (LLVM_UNLIKELY(!LazyRoot))
+ return LazyRoot.takeError();
+ if (H->RootTrieOffset.compare_exchange_strong(
+ Race, LazyRoot->getOffset().asSubtrie()))
+ return *LazyRoot;
+
+ // There was a race. Return the other root.
+ //
+ // TODO: Avoid leaking the lazy root by storing it in an allocator.
+ return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race));
+}
+
+Expected<TrieRawHashMapHandle>
+TrieRawHashMapHandle::create(MappedFileRegionArena &Alloc, StringRef Name,
+ std::optional<uint64_t> NumRootBits,
+ uint64_t NumSubtrieBits, uint64_t NumHashBits,
+ uint64_t RecordDataSize) {
+ // Allocate.
+ auto Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1);
+ if (LLVM_UNLIKELY(!Offset))
+ return Offset.takeError();
+
+ // Construct the header and the name.
+ assert(Name.size() <= UINT16_MAX && "Expected smaller table name");
+ assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits");
+ assert(NumHashBits <= UINT16_MAX && "Expected valid hash size");
+  assert(RecordDataSize <= UINT32_MAX && "Expected valid record data size");
+ auto *H = new (Alloc.getRegion().data() + *Offset)
+ Header{{TableHandle::TableKind::TrieRawHashMap, (uint16_t)Name.size(),
+ (uint32_t)sizeof(Header)},
+ (uint8_t)NumSubtrieBits,
+ /*Flags=*/0,
+ (uint16_t)NumHashBits,
+ (uint32_t)RecordDataSize,
+ /*RootTrieOffset=*/{0},
+ /*AllocatorOffset=*/{0}};
+ char *NameStorage = reinterpret_cast<char *>(H + 1);
+ llvm::copy(Name, NameStorage);
+ NameStorage[Name.size()] = 0;
+
+ // Construct a root trie, if requested.
+  TrieRawHashMapHandle Trie(Alloc.getRegion(), *H);
+  if (NumRootBits) {
+    auto Sub = SubtrieHandle::create(Alloc, 0, *NumRootBits);
+    if (LLVM_UNLIKELY(!Sub))
+      return Sub.takeError();
+    H->RootTrieOffset = Sub->getOffset().asSubtrie();
+  }
+  return Trie;
+}
+
+TrieRawHashMapHandle::RecordData
+TrieRawHashMapHandle::getRecord(SubtrieSlotValue Offset) {
+ char *Begin = Region->data() + Offset.asData();
+ OnDiskTrieRawHashMap::ValueProxy Proxy;
+ Proxy.Data = MutableArrayRef(Begin, getRecordDataSize());
+ Proxy.Hash = ArrayRef(reinterpret_cast<const uint8_t *>(Proxy.Data.end()),
+ getNumHashBytes());
+ return RecordData{Proxy, Offset};
+}
+
+Expected<TrieRawHashMapHandle::RecordData>
+TrieRawHashMapHandle::createRecord(MappedFileRegionArena &Alloc,
+ ArrayRef<uint8_t> Hash) {
+ assert(&Alloc.getRegion() == Region);
+ assert(Hash.size() == getNumHashBytes());
+ auto Offset = Alloc.allocateOffset(getRecordSize());
+ if (LLVM_UNLIKELY(!Offset))
+ return Offset.takeError();
+
+ RecordData Record = getRecord(SubtrieSlotValue::getDataOffset(*Offset));
+ llvm::copy(Hash, const_cast<uint8_t *>(Record.Proxy.Hash.begin()));
+ return Record;
+}
+
+Expected<OnDiskTrieRawHashMap::const_pointer>
+OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const {
+ // Check alignment.
+ if (!isAligned(MappedFileRegionArena::getAlign(), Offset.get()))
+ return createStringError(make_error_code(std::errc::protocol_error),
+ "unaligned file offset at 0x" +
+ utohexstr(Offset.get(), /*LowerCase=*/true));
+
+ // Check bounds.
+ //
+  // Note: There's no potential overflow when using \c uint64_t because Offset
+  // is in the valid offset range and the record size is in \c [0,UINT32_MAX].
+ if (!validOffset(Offset) ||
+ Offset.get() + Impl->Trie.getRecordSize() > Impl->File.getAlloc().size())
+ return createStringError(make_error_code(std::errc::protocol_error),
+ "file offset too large: 0x" +
+ utohexstr(Offset.get(), /*LowerCase=*/true));
+
+ // Looks okay...
+ TrieRawHashMapHandle::RecordData D =
+ Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset));
+ return const_pointer(D.Proxy, D.getFileOffset());
+}
+
+OnDiskTrieRawHashMap::const_pointer
+OnDiskTrieRawHashMap::find(ArrayRef<uint8_t> Hash) const {
+ TrieRawHashMapHandle Trie = Impl->Trie;
+ assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+
+ SubtrieHandle S = Trie.getRoot();
+ if (!S)
+ return const_pointer();
+
+ TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash);
+ size_t Index = IndexGen.next();
+ for (;;) {
+ // Try to set the content.
+ SubtrieSlotValue V = S.load(Index);
+ if (!V)
+ return const_pointer();
+
+ // Check for an exact match.
+ if (V.isData()) {
+ TrieRawHashMapHandle::RecordData D = Trie.getRecord(V);
+ return D.Proxy.Hash == Hash ? const_pointer(D.Proxy, D.getFileOffset())
+ : const_pointer();
+ }
+
+ Index = IndexGen.next();
+ S = SubtrieHandle(Trie.getRegion(), V);
+ }
+}
+
+/// Only safe if the subtrie is empty.
+void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) {
+ assert(StartBit > H->StartBit);
+ assert(NumBits <= H->NumBits);
+ // Ideally would also assert that all slots are empty, but that's expensive.
+
+ H->StartBit = StartBit;
+ H->NumBits = NumBits;
+}
+
+Expected<OnDiskTrieRawHashMap::pointer>
+OnDiskTrieRawHashMap::insertLazy(ArrayRef<uint8_t> Hash,
+ LazyInsertOnConstructCB OnConstruct,
+ LazyInsertOnLeakCB OnLeak) {
+ TrieRawHashMapHandle Trie = Impl->Trie;
+ assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash");
+
+ MappedFileRegionArena &Alloc = Impl->File.getAlloc();
+ std::optional<SubtrieHandle> S;
+ auto Err = Trie.getOrCreateRoot(Alloc).moveInto(S);
+ if (LLVM_UNLIKELY(Err))
+ return std::move(Err);
+
+ TrieHashIndexGenerator IndexGen = Trie.getIndexGen(*S, Hash);
+ size_t Index = IndexGen.next();
+
+ // Walk through the hash bytes and insert into correct trie position.
+ std::optional<TrieRawHashMapHandle::RecordData> NewRecord;
+ SubtrieHandle UnusedSubtrie;
+ for (;;) {
+ SubtrieSlotValue Existing = S->load(Index);
+
+ // Try to set it, if it's empty.
+ if (!Existing) {
+ if (!NewRecord) {
+ auto Err = Trie.createRecord(Alloc, Hash).moveInto(NewRecord);
+ if (LLVM_UNLIKELY(Err))
+ return std::move(Err);
+ if (OnConstruct)
+ OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy);
+ }
+
+ if (S->compare_exchange_strong(Index, Existing, NewRecord->Offset))
+ return pointer(NewRecord->Proxy, NewRecord->Offset.asDataFileOffset());
+
+ // Race means that Existing is no longer empty; fall through...
+ }
+
+ if (Existing.isSubtrie()) {
+ S = SubtrieHandle(Trie.getRegion(), Existing);
+ Index = IndexGen.next();
+ continue;
+ }
+
+ // Check for an exact match.
+ TrieRawHashMapHandle::RecordData ExistingRecord = Trie.getRecord(Existing);
+ if (ExistingRecord.Proxy.Hash == Hash) {
+ if (NewRecord && OnLeak)
+ OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy,
+ ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy);
+ return pointer(ExistingRecord.Proxy,
+ ExistingRecord.Offset.asDataFileOffset());
+ }
+
+ // Sink the existing content as long as the indexes match.
+ for (;;) {
+ size_t NextIndex = IndexGen.next();
+ size_t NewIndexForExistingContent =
+ IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash);
+
+ auto Err = S->sink(Index, Existing, Alloc, IndexGen.getNumBits(),
+ UnusedSubtrie, NewIndexForExistingContent)
+ .moveInto(S);
+ if (LLVM_UNLIKELY(Err))
+ return std::move(Err);
+ Index = NextIndex;
+
+ // Found the difference.
+ if (NextIndex != NewIndexForExistingContent)
+ break;
+ }
+ }
+}
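A hedged usage sketch of insertLazy; Map, Hash, and Payload are assumptions, and the callback parameter types follow the declarations in OnDiskTrieRawHashMap.h:

llvm::Error insertExample(llvm::cas::OnDiskTrieRawHashMap &Map,
                          llvm::ArrayRef<uint8_t> Hash,
                          llvm::ArrayRef<char> Payload) {
  auto P = Map.insertLazy(
      Hash,
      /*OnConstruct=*/
      [&](llvm::cas::FileOffset Off,
          llvm::cas::OnDiskTrieRawHashMap::ValueProxy V) {
        llvm::copy(Payload, V.Data.begin()); // Fill record data exactly once.
      },
      /*OnLeak=*/nullptr);
  if (!P)
    return P.takeError();
  return llvm::Error::success();
}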
+
+Expected<SubtrieHandle> SubtrieHandle::sink(size_t I, SubtrieSlotValue V,
+ MappedFileRegionArena &Alloc,
+ size_t NumSubtrieBits,
+ SubtrieHandle &UnusedSubtrie,
+ size_t NewI) {
+ std::optional<SubtrieHandle> NewS;
+ if (UnusedSubtrie) {
+ // Steal UnusedSubtrie and initialize it.
+ NewS.emplace();
+ std::swap(*NewS, UnusedSubtrie);
+ NewS->reinitialize(getStartBit() + getNumBits(), NumSubtrieBits);
+ } else {
+ // Allocate a new, empty subtrie.
+ auto Err = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(),
+ NumSubtrieBits)
+ .moveInto(NewS);
+ if (LLVM_UNLIKELY(Err))
+ return std::move(Err);
+ }
+
+ NewS->store(NewI, V);
+ if (compare_exchange_strong(I, V, NewS->getOffset()))
+ return *NewS; // Success!
+
+ // Raced.
+ assert(V.isSubtrie() && "Expected racing sink() to add a subtrie");
+
+ // Wipe out the new slot so NewS can be reused and set the out parameter.
+ NewS->store(NewI, SubtrieSlotValue());
+ UnusedSubtrie = *NewS;
+
+ // Return the subtrie added by the concurrent sink() call.
+ return SubtrieHandle(Alloc.getRegion(), V);
+}
+
+void OnDiskTrieRawHashMap::print(
+ raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+ Impl->Trie.print(OS, PrintRecordData);
+}
+
+Error OnDiskTrieRawHashMap::validate(
+ function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const {
+ return Impl->Trie.validate(RecordVerifier);
+}
+
+// Helper function that prints hex digits, supporting a sub-byte starting
+// position.
+static void printHexDigits(raw_ostream &OS, ArrayRef<uint8_t> Bytes,
+ size_t StartBit, size_t NumBits) {
+ assert(StartBit % 4 == 0);
+ assert(NumBits % 4 == 0);
+ for (size_t I = StartBit, E = StartBit + NumBits; I != E; I += 4) {
+ uint8_t HexPair = Bytes[I / 8];
+ uint8_t HexDigit = I % 8 == 0 ? HexPair >> 4 : HexPair & 0xf;
+ OS << hexdigit(HexDigit, /*LowerCase=*/true);
+ }
+}
+
+static void printBits(raw_ostream &OS, ArrayRef<uint8_t> Bytes, size_t StartBit,
+ size_t NumBits) {
+ assert(StartBit + NumBits <= Bytes.size() * 8u);
+ for (size_t I = StartBit, E = StartBit + NumBits; I != E; ++I) {
+ uint8_t Byte = Bytes[I / 8];
+ size_t ByteOffset = I % 8;
+ if (size_t ByteShift = 8 - ByteOffset - 1)
+ Byte >>= ByteShift;
+ OS << (Byte & 0x1 ? '1' : '0');
+ }
+}
+
+void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef<uint8_t> Bytes) const {
+ // afb[1c:00*01110*0]def
+ size_t EndBit = getStartBit() + getNumBits();
+ size_t HashEndBit = Bytes.size() * 8u;
+
+ size_t FirstBinaryBit = getStartBit() & ~0x3u;
+ printHexDigits(OS, Bytes, 0, FirstBinaryBit);
+
+ size_t LastBinaryBit = (EndBit + 3u) & ~0x3u;
+ OS << "[";
+ printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit);
+ OS << "]";
+
+ printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit);
+}
+
+static void appendIndexBits(std::string &Prefix, size_t Index,
+ size_t NumSlots) {
+ std::string Bits;
+ for (size_t NumBits = 1u; NumBits < NumSlots; NumBits <<= 1) {
+ Bits.push_back('0' + (Index & 0x1));
+ Index >>= 1;
+ }
+ for (char Ch : llvm::reverse(Bits))
+ Prefix += Ch;
+}
+
+static void printPrefix(raw_ostream &OS, StringRef Prefix) {
+ while (Prefix.size() >= 4) {
+ uint8_t Digit;
+ bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Digit);
+ assert(!ErrorParsingBinary);
+ (void)ErrorParsingBinary;
+ OS << hexdigit(Digit, /*LowerCase=*/true);
+ Prefix = Prefix.drop_front(4);
+ }
+ if (!Prefix.empty())
+ OS << "[" << Prefix << "]";
+}
+
+LLVM_DUMP_METHOD void OnDiskTrieRawHashMap::dump() const { print(dbgs()); }
+
+static Expected<size_t> checkParameter(StringRef Label, size_t Max,
+ std::optional<size_t> Value,
+ std::optional<size_t> Default,
+ StringRef Path, StringRef TableName) {
+ assert(Value || Default);
+ assert(!Default || *Default <= Max);
+ if (!Value)
+ return *Default;
+
+ if (*Value <= Max)
+ return *Value;
+ return createTableConfigError(
+ std::errc::argument_out_of_domain, Path, TableName,
+ "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) + ")");
+}
+
+size_t OnDiskTrieRawHashMap::size() const { return Impl->File.size(); }
+size_t OnDiskTrieRawHashMap::capacity() const {
+ return Impl->File.getRegion().size();
+}
+
+Expected<OnDiskTrieRawHashMap>
+OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+ size_t NumHashBits, uint64_t DataSize,
+ uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ std::optional<size_t> NewTableNumRootBits,
+ std::optional<size_t> NewTableNumSubtrieBits) {
+ SmallString<128> PathStorage;
+ StringRef Path = PathTwine.toStringRef(PathStorage);
+ SmallString<128> TrieNameStorage;
+ StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage);
+
+ constexpr size_t DefaultNumRootBits = 10;
+ constexpr size_t DefaultNumSubtrieBits = 6;
+
+ size_t NumRootBits;
+ if (Error E = checkParameter(
+ "root bits", TrieRawHashMapHandle::MaxNumRootBits,
+ NewTableNumRootBits, DefaultNumRootBits, Path, TrieName)
+ .moveInto(NumRootBits))
+ return std::move(E);
+
+ size_t NumSubtrieBits;
+ if (Error E = checkParameter("subtrie bits",
+ TrieRawHashMapHandle::MaxNumSubtrieBits,
+ NewTableNumSubtrieBits, DefaultNumSubtrieBits,
+ Path, TrieName)
+ .moveInto(NumSubtrieBits))
+ return std::move(E);
+
+ size_t NumHashBytes = NumHashBits >> 3;
+ if (Error E =
+ checkParameter("hash size", TrieRawHashMapHandle::MaxNumHashBits,
+ NumHashBits, std::nullopt, Path, TrieName)
+ .takeError())
+ return std::move(E);
+ assert(NumHashBits == NumHashBytes << 3 &&
+ "Expected hash size to be byte-aligned");
+ if (NumHashBits != NumHashBytes << 3)
+ return createTableConfigError(
+ std::errc::argument_out_of_domain, Path, TrieName,
+ "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)");
+
+  // Constructor used when the file doesn't exist.
+ auto NewDBConstructor = [&](DatabaseFile &DB) -> Error {
+ auto Trie =
+ TrieRawHashMapHandle::create(DB.getAlloc(), TrieName, NumRootBits,
+ NumSubtrieBits, NumHashBits, DataSize);
+ if (LLVM_UNLIKELY(!Trie))
+ return Trie.takeError();
+
+ return DB.addTable(*Trie);
+ };
+
+ // Get or create the file.
+ Expected<DatabaseFile> File =
+ DatabaseFile::create(Path, MaxFileSize, NewDBConstructor);
+ if (!File)
+ return File.takeError();
+
+ // Find the trie and validate it.
+ std::optional<TableHandle> Table = File->findTable(TrieName);
+ if (!Table)
+ return createTableConfigError(std::errc::argument_out_of_domain, Path,
+ TrieName, "table not found");
+ if (Error E = checkTable("table kind", (size_t)TrieRawHashMapHandle::Kind,
+ (size_t)Table->getHeader().Kind, Path, TrieName))
+ return std::move(E);
+ auto Trie = Table->cast<TrieRawHashMapHandle>();
+ assert(Trie && "Already checked the kind");
+
+ // Check the hash and data size.
+ if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(),
+ Path, TrieName))
+ return std::move(E);
+ if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(),
+ Path, TrieName))
+ return std::move(E);
+
+ // No flags supported right now. Either corrupt, or coming from a future
+ // writer.
+ if (size_t Flags = Trie.getFlags())
+ return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+ "unsupported flags: " + Twine(Flags));
+
+ // Success.
+ OnDiskTrieRawHashMap::ImplType Impl{DatabaseFile(std::move(*File)), Trie};
+ return OnDiskTrieRawHashMap(std::make_unique<ImplType>(std::move(Impl)));
+}
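A hedged example of opening (or creating) such a map; the path, table name, and sizes below are made up, and 160 hash bits keeps the hash byte-aligned as required above:

#include "llvm/CAS/OnDiskTrieRawHashMap.h"
#include "llvm/Support/Error.h"
#include <optional>

void openExample() {
  auto MapOr = llvm::cas::OnDiskTrieRawHashMap::create(
      "/tmp/example.db", "example-trie",
      /*NumHashBits=*/160, /*DataSize=*/16,
      /*MaxFileSize=*/16ULL << 20, /*NewFileInitialSize=*/std::nullopt,
      /*NewTableNumRootBits=*/std::nullopt,
      /*NewTableNumSubtrieBits=*/std::nullopt);
  if (!MapOr)
    llvm::report_fatal_error(MapOr.takeError());
}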
+
+static Error createInvalidTrieError(uint64_t Offset, const Twine &Msg) {
+ return createStringError(make_error_code(std::errc::protocol_error),
+ "invalid trie at 0x" +
+ utohexstr(Offset, /*LowerCase=*/true) + ": " +
+ Msg);
+}
+
+//===----------------------------------------------------------------------===//
+// TrieVisitor data structures.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A multi-threaded visitor to traverse the Trie.
+///
+/// TODO: add more sanity checks that aren't just about plain data corruption.
+/// For example, some ill-formed data can be constructed to form a cycle using
+/// sub-tries, which can lead to an infinite loop when visiting (or inserting
+/// data).
+class TrieVisitor {
+public:
+ TrieVisitor(TrieRawHashMapHandle Trie, unsigned ThreadCount = 0,
+ unsigned ErrorLimit = 50)
+ : Trie(Trie), ErrorLimit(ErrorLimit),
+ Threads(hardware_concurrency(ThreadCount)) {}
+ virtual ~TrieVisitor() = default;
+ Error visit();
+
+private:
+ // Virtual method to implement the action when visiting a sub-trie.
+ virtual Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) {
+ return Error::success();
+ }
+
+ // Virtual method to implement the action when visiting a slot in a trie node.
+ virtual Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+ SubtrieSlotValue Slot) {
+ return Error::success();
+ }
+
+protected:
+ TrieRawHashMapHandle Trie;
+
+private:
+ Error traverseTrieNode(SubtrieHandle Node, StringRef Prefix);
+
+ Error validateSubTrie(SubtrieHandle Node, bool IsRoot);
+
+ // Helper function to capture errors when visiting the trie nodes.
+ void addError(Error NewError) {
+ assert(NewError && "not an error");
+ std::lock_guard<std::mutex> ErrorLock(Lock);
+ if (NumError >= ErrorLimit) {
+ // Too many errors.
+ consumeError(std::move(NewError));
+ return;
+ }
+
+ if (Err)
+ Err = joinErrors(std::move(*Err), std::move(NewError));
+ else
+ Err = std::move(NewError);
+ NumError++;
+ }
+
+ bool tooManyErrors() {
+ std::lock_guard<std::mutex> ErrorLock(Lock);
+ return (bool)Err && NumError >= ErrorLimit;
+ }
+
+ const unsigned ErrorLimit;
+ std::optional<Error> Err;
+ unsigned NumError = 0;
+ std::mutex Lock;
+ DefaultThreadPool Threads;
+};
+
+/// A visitor that traverses and prints the Trie.
+class TriePrinter : public TrieVisitor {
+public:
+ TriePrinter(TrieRawHashMapHandle Trie, raw_ostream &OS,
+ function_ref<void(ArrayRef<char>)> PrintRecordData)
+ : TrieVisitor(Trie, /*ThreadCount=*/1), OS(OS),
+ PrintRecordData(PrintRecordData) {}
+
+ Error printRecords() {
+ if (Records.empty())
+ return Error::success();
+
+ OS << "records\n";
+ llvm::sort(Records);
+ for (int64_t Offset : Records) {
+ TrieRawHashMapHandle::RecordData Record =
+ Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset));
+ if (auto Err = printRecord(Record))
+ return Err;
+ }
+ return Error::success();
+ }
+
+ Error printRecord(TrieRawHashMapHandle::RecordData &Record) {
+ OS << "- addr=" << (void *)Record.getFileOffset().get() << " ";
+ if (PrintRecordData) {
+ PrintRecordData(Record.Proxy.Data);
+ } else {
+ OS << "bytes=";
+ ArrayRef<uint8_t> Data(
+ reinterpret_cast<const uint8_t *>(Record.Proxy.Data.data()),
+ Record.Proxy.Data.size());
+ printHexDigits(OS, Data, 0, Data.size() * 8);
+ }
+ OS << "\n";
+ return Error::success();
+ }
+
+ Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) override {
+ if (Prefix.empty()) {
+ OS << "root";
+ } else {
+ OS << "subtrie=";
+ printPrefix(OS, Prefix);
+ }
+
+ OS << " addr="
+ << (void *)(reinterpret_cast<const char *>(&SubTrie.getHeader()) -
+ Trie.getRegion().data());
+ OS << " num-slots=" << SubTrie.getNumSlots() << "\n";
+ return Error::success();
+ }
+
+ Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+ SubtrieSlotValue Slot) override {
+ OS << "- index=";
+ for (size_t Pad : {10, 100, 1000})
+ if (I < Pad && Subtrie.getNumSlots() >= Pad)
+ OS << "0";
+ OS << I << " ";
+ if (Slot.isSubtrie()) {
+ OS << "addr=" << (void *)Slot.asSubtrie();
+ OS << " subtrie=";
+ printPrefix(OS, Prefix);
+ OS << "\n";
+ return Error::success();
+ }
+ TrieRawHashMapHandle::RecordData Record = Trie.getRecord(Slot);
+ OS << "addr=" << (void *)Record.getFileOffset().get();
+ OS << " content=";
+ Subtrie.printHash(OS, Record.Proxy.Hash);
+ OS << "\n";
+ Records.push_back(Slot.asData());
+ return Error::success();
+ }
+
+private:
+ raw_ostream &OS;
+ function_ref<void(ArrayRef<char>)> PrintRecordData;
+ SmallVector<int64_t> Records;
+};
+
+/// TrieVerifier that adds additional verification on top of the basic visitor.
+class TrieVerifier : public TrieVisitor {
+public:
+ TrieVerifier(
+ TrieRawHashMapHandle Trie,
+ function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+ RecordVerifier)
+ : TrieVisitor(Trie), RecordVerifier(RecordVerifier) {}
+
+private:
+ Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) final {
+ return Error::success();
+ }
+
+ Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix,
+ SubtrieSlotValue Slot) final {
+ if (RecordVerifier && Slot.isData()) {
+ if (!isAligned(MappedFileRegionArena::getAlign(), Slot.asData()))
+ return createInvalidTrieError(Slot.asData(), "mis-aligned data entry");
+
+ TrieRawHashMapHandle::RecordData Record =
+ Trie.getRecord(SubtrieSlotValue::getDataOffset(Slot.asData()));
+ return RecordVerifier(Slot.asDataFileOffset(),
+ OnDiskTrieRawHashMap::ConstValueProxy{
+ Record.Proxy.Hash, Record.Proxy.Data});
+ }
+ return Error::success();
+ }
+
+ function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+ RecordVerifier;
+};
+} // namespace
+
+Error TrieVisitor::visit() {
+ auto Root = Trie.getRoot();
+ if (!Root)
+ return Error::success();
+
+ if (auto Err = validateSubTrie(Root, /*IsRoot=*/true))
+ return Err;
+
+ if (auto Err = visitSubTrie("", Root))
+ return Err;
+
+ SmallVector<SubtrieHandle> Subs;
+ SmallVector<std::string> Prefixes;
+ const size_t NumSlots = Root.getNumSlots();
+ for (size_t I = 0, E = NumSlots; I != E; ++I) {
+ SubtrieSlotValue Slot = Root.load(I);
+ if (!Slot)
+ continue;
+ uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData();
+ if (Offset >= (uint64_t)Trie.getRegion().size())
+ return createInvalidTrieError(Offset, "slot points out of bound");
+ std::string SubtriePrefix;
+ appendIndexBits(SubtriePrefix, I, NumSlots);
+ if (Slot.isSubtrie()) {
+ SubtrieHandle S(Trie.getRegion(), Slot);
+ Subs.push_back(S);
+ Prefixes.push_back(SubtriePrefix);
+ }
+ if (auto Err = visitSlot(I, Root, SubtriePrefix, Slot))
+ return Err;
+ }
+
+ for (size_t I = 0, E = Subs.size(); I != E; ++I) {
+ Threads.async(
+ [&](unsigned Idx) {
+ // Don't run if there is an error already.
+ if (tooManyErrors())
+ return;
+ if (auto Err = traverseTrieNode(Subs[Idx], Prefixes[Idx]))
+ addError(std::move(Err));
+ },
+ I);
+ }
+
+ Threads.wait();
+ if (Err)
+ return std::move(*Err);
+ return Error::success();
+}
+
+Error TrieVisitor::validateSubTrie(SubtrieHandle Node, bool IsRoot) {
+ char *Addr = reinterpret_cast<char *>(&Node.getHeader());
+ const int64_t Offset = Node.getFileOffset().get();
+ if (Addr + Node.getSize() >=
+ Trie.getRegion().data() + Trie.getRegion().size())
+ return createInvalidTrieError(Offset, "subtrie node spans out of bound");
+
+ if (!IsRoot &&
+ Node.getStartBit() + Node.getNumBits() > Trie.getNumHashBits()) {
+ return createInvalidTrieError(Offset,
+ "subtrie represents too many hash bits");
+ }
+
+ if (IsRoot) {
+ if (Node.getStartBit() != 0)
+ return createInvalidTrieError(Offset,
+ "root node doesn't start at 0 index");
+
+ return Error::success();
+ }
+
+ if (Node.getNumBits() > Trie.getNumSubtrieBits())
+ return createInvalidTrieError(Offset, "subtrie has wrong number of slots");
+
+ return Error::success();
+}
+
+Error TrieVisitor::traverseTrieNode(SubtrieHandle Node, StringRef Prefix) {
+ if (auto Err = validateSubTrie(Node, /*IsRoot=*/false))
+ return Err;
+
+ if (auto Err = visitSubTrie(Prefix, Node))
+ return Err;
+
+ SmallVector<SubtrieHandle> Subs;
+ SmallVector<std::string> Prefixes;
+ const size_t NumSlots = Node.getNumSlots();
+ for (size_t I = 0, E = NumSlots; I != E; ++I) {
+ SubtrieSlotValue Slot = Node.load(I);
+ if (!Slot)
+ continue;
+ uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData();
+ if (Offset >= (uint64_t)Trie.getRegion().size())
+ return createInvalidTrieError(Offset, "slot points out of bound");
+ std::string SubtriePrefix = Prefix.str();
+ appendIndexBits(SubtriePrefix, I, NumSlots);
+ if (Slot.isSubtrie()) {
+ SubtrieHandle S(Trie.getRegion(), Slot);
+ Subs.push_back(S);
+ Prefixes.push_back(SubtriePrefix);
+ }
+ if (auto Err = visitSlot(I, Node, SubtriePrefix, Slot))
+ return Err;
+ }
+ for (size_t I = 0, E = Subs.size(); I != E; ++I)
+ if (auto Err = traverseTrieNode(Subs[I], Prefixes[I]))
+ return Err;
+
+ return Error::success();
+}
+
+void TrieRawHashMapHandle::print(
+ raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+ OS << "hash-num-bits=" << getNumHashBits()
+ << " hash-size=" << getNumHashBytes()
+ << " record-data-size=" << getRecordDataSize() << "\n";
+
+ TriePrinter Printer(*this, OS, PrintRecordData);
+ if (auto Err = Printer.visit())
+ OS << "error: " << toString(std::move(Err)) << "\n";
+
+ if (auto Err = Printer.printRecords())
+ OS << "error: " << toString(std::move(Err)) << "\n";
+}
+
+Error TrieRawHashMapHandle::validate(
+ function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+ RecordVerifier) const {
+ // Use the base TrieVisitor to identify the errors inside trie first.
+ TrieVisitor BasicVerifier(*this);
+ if (auto Err = BasicVerifier.visit())
+ return Err;
+
+  // If the trie data structure is sound, do a second pass to verify the data;
+  // the verifier function can assume the index is correct. However, newly
+  // added bad entries can still produce errors.
+ TrieVerifier Verifier(*this, RecordVerifier);
+ return Verifier.visit();
+}
+
+#else // !LLVM_ENABLE_ONDISK_CAS
+
+struct OnDiskTrieRawHashMap::ImplType {};
+
+Expected<OnDiskTrieRawHashMap>
+OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine,
+ size_t NumHashBits, uint64_t DataSize,
+ uint64_t MaxFileSize,
+ std::optional<uint64_t> NewFileInitialSize,
+ std::optional<size_t> NewTableNumRootBits,
+ std::optional<size_t> NewTableNumSubtrieBits) {
+ return createStringError(make_error_code(std::errc::not_supported),
+ "OnDiskTrieRawHashMap is not supported");
+}
+
+Expected<OnDiskTrieRawHashMap::pointer>
+OnDiskTrieRawHashMap::insertLazy(ArrayRef<uint8_t> Hash,
+ LazyInsertOnConstructCB OnConstruct,
+ LazyInsertOnLeakCB OnLeak) {
+ return createStringError(make_error_code(std::errc::not_supported),
+ "OnDiskTrieRawHashMap is not supported");
+}
+
+Expected<OnDiskTrieRawHashMap::const_pointer>
+OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const {
+ return createStringError(make_error_code(std::errc::not_supported),
+ "OnDiskTrieRawHashMap is not supported");
+}
+
+OnDiskTrieRawHashMap::const_pointer
+OnDiskTrieRawHashMap::find(ArrayRef<uint8_t> Hash) const {
+ return const_pointer();
+}
+
+void OnDiskTrieRawHashMap::print(
+ raw_ostream &OS, function_ref<void(ArrayRef<char>)> PrintRecordData) const {
+}
+
+Error OnDiskTrieRawHashMap::validate(
+ function_ref<Error(FileOffset, OnDiskTrieRawHashMap::ConstValueProxy)>
+ RecordVerifier) const {
+ return createStringError(make_error_code(std::errc::not_supported),
+ "OnDiskTrieRawHashMap is not supported");
+}
+
+size_t OnDiskTrieRawHashMap::size() const { return 0; }
+size_t OnDiskTrieRawHashMap::capacity() const { return 0; }
+
+#endif // LLVM_ENABLE_ONDISK_CAS
+
+OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl)
+ : Impl(std::move(Impl)) {}
+OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS) =
+ default;
+OnDiskTrieRawHashMap &
+OnDiskTrieRawHashMap::operator=(OnDiskTrieRawHashMap &&RHS) = default;
+OnDiskTrieRawHashMap::~OnDiskTrieRawHashMap() = default;
diff --git a/llvm/lib/CGData/CodeGenDataWriter.cpp b/llvm/lib/CGData/CodeGenDataWriter.cpp
index 14a8558..a2bbcee 100644
--- a/llvm/lib/CGData/CodeGenDataWriter.cpp
+++ b/llvm/lib/CGData/CodeGenDataWriter.cpp
@@ -40,7 +40,7 @@ void CGDataOStream::patch(ArrayRef<CGDataPatchItem> P) {
for (const auto &K : P) {
for (size_t I = 0; I < K.D.size(); ++I) {
uint64_t Bytes =
- endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+ endian::byte_swap<uint64_t>(K.D[I], llvm::endianness::little);
Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
reinterpret_cast<const char *>(&Bytes), sizeof(uint64_t));
}
@@ -52,7 +52,7 @@ void CGDataOStream::patch(ArrayRef<CGDataPatchItem> P) {
for (const auto &K : P) {
for (size_t I = 0; I < K.D.size(); ++I) {
uint64_t Bytes =
- endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+ endian::byte_swap<uint64_t>(K.D[I], llvm::endianness::little);
VOStream.pwrite(reinterpret_cast<const char *>(&Bytes),
sizeof(uint64_t), K.Pos + I * sizeof(uint64_t));
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 0f3ff98..d98d180 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -105,6 +105,8 @@ DebugHandlerBase::~DebugHandlerBase() = default;
void DebugHandlerBase::beginModule(Module *M) {
if (M->debug_compile_units().empty())
Asm = nullptr;
+ else
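+    // Scan the module once so per-function processing can map a
+    // DISubprogram back to its IR function.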
+ LScopes.initialize(*M);
}
// Each LexicalScope has first instruction and last instruction to mark
@@ -269,7 +271,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
// Grab the lexical scopes for the function, if we don't have any of those
// then we're not going to be able to do anything.
- LScopes.initialize(*MF);
+ LScopes.scanFunction(*MF);
if (LScopes.empty()) {
beginFunctionImpl(MF);
return;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 67f526f..518121e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -537,8 +537,9 @@ void DwarfCompileUnit::addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
// and DW_AT_high_pc attributes. If there are global variables in this
// scope then create and insert DIEs for these variables.
DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP,
+ const Function &F,
MCSymbol *LineTableSym) {
- DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes());
+ DIE *SPDie = getOrCreateSubprogramDIE(SP, &F, includeMinimalInlineScopes());
SmallVector<RangeSpan, 2> BB_List;
// If basic block sections are on, ranges for each basic block section has
// to be emitted separately.
@@ -1122,9 +1123,10 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
}
DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub,
+ const Function &F,
LexicalScope *Scope,
MCSymbol *LineTableSym) {
- DIE &ScopeDIE = updateSubprogramScopeDIE(Sub, LineTableSym);
+ DIE &ScopeDIE = updateSubprogramScopeDIE(Sub, F, LineTableSym);
if (Scope) {
assert(!Scope->getInlinedAt());
@@ -1198,32 +1200,17 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
return ObjectPointer;
}
-void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
- LexicalScope *Scope) {
- auto *SP = cast<DISubprogram>(Scope->getScopeNode());
- if (getAbstractScopeDIEs().count(SP))
- return;
+DIE &DwarfCompileUnit::getOrCreateAbstractSubprogramDIE(
+ const DISubprogram *SP) {
+ if (auto *AbsDef = getAbstractScopeDIEs().lookup(SP))
+ return *AbsDef;
- DIE *ContextDIE;
- DwarfCompileUnit *ContextCU = this;
-
- if (includeMinimalInlineScopes())
- ContextDIE = &getUnitDie();
- // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with
- // the important distinction that the debug node is not associated with the
- // DIE (since the debug node will be associated with the concrete DIE, if
- // any). It could be refactored to some common utility function.
- else if (auto *SPDecl = SP->getDeclaration()) {
- ContextDIE = &getUnitDie();
- getOrCreateSubprogramDIE(SPDecl);
- } else {
- ContextDIE = getOrCreateContextDIE(SP->getScope());
- // The scope may be shared with a subprogram that has already been
- // constructed in another CU, in which case we need to construct this
- // subprogram in the same CU.
- ContextCU = DD->lookupCU(ContextDIE->getUnitDie());
- }
+ auto [ContextDIE, ContextCU] = getOrCreateAbstractSubprogramContextDIE(SP);
+ return createAbstractSubprogramDIE(SP, ContextDIE, ContextCU);
+}
+DIE &DwarfCompileUnit::createAbstractSubprogramDIE(
+ const DISubprogram *SP, DIE *ContextDIE, DwarfCompileUnit *ContextCU) {
// Passing null as the associated node because the abstract definition
// shouldn't be found by lookup.
DIE &AbsDef = ContextCU->createAndAddDIE(dwarf::DW_TAG_subprogram,
@@ -1237,8 +1224,45 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
DD->getDwarfVersion() <= 4 ? std::optional<dwarf::Form>()
: dwarf::DW_FORM_implicit_const,
dwarf::DW_INL_inlined);
- if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, AbsDef))
- ContextCU->addDIEEntry(AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
+
+ return AbsDef;
+}
+
+std::pair<DIE *, DwarfCompileUnit *>
+DwarfCompileUnit::getOrCreateAbstractSubprogramContextDIE(
+ const DISubprogram *SP) {
+ bool Minimal = includeMinimalInlineScopes();
+ bool IgnoreScope = shouldPlaceInUnitDIE(SP, Minimal);
+ DIE *ContextDIE = getOrCreateSubprogramContextDIE(SP, IgnoreScope);
+
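+  // Emit the declaration DIE first so it precedes the abstract definition.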
+ if (auto *SPDecl = SP->getDeclaration())
+ if (!Minimal)
+ getOrCreateSubprogramDIE(SPDecl, nullptr);
+
+ // The scope may be shared with a subprogram that has already been
+ // constructed in another CU, in which case we need to construct this
+ // subprogram in the same CU.
+ auto *ContextCU = IgnoreScope ? this : DD->lookupCU(ContextDIE->getUnitDie());
+
+ return std::make_pair(ContextDIE, ContextCU);
+}
+
+void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
+ LexicalScope *Scope) {
+ auto *SP = cast<DISubprogram>(Scope->getScopeNode());
+
+  // Populate the subprogram DIE only once.
+ if (!getFinalizedAbstractSubprograms().insert(SP).second)
+ return;
+
+ auto [ContextDIE, ContextCU] = getOrCreateAbstractSubprogramContextDIE(SP);
+ DIE *AbsDef = getAbstractScopeDIEs().lookup(SP);
+ if (!AbsDef)
+ AbsDef = &createAbstractSubprogramDIE(SP, ContextDIE, ContextCU);
+
+ if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, *AbsDef))
+ ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer,
+ *ObjectPointer);
}
bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const {
@@ -1293,9 +1317,9 @@ DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
}
DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
- DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail,
- const MCSymbol *PCAddr, const MCSymbol *CallAddr, unsigned CallReg,
- DIType *AllocSiteTy) {
+ DIE &ScopeDIE, const DISubprogram *CalleeSP, const Function *CalleeF,
+ bool IsTail, const MCSymbol *PCAddr, const MCSymbol *CallAddr,
+ unsigned CallReg, DIType *AllocSiteTy) {
// Insert a call site entry DIE within ScopeDIE.
DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site),
ScopeDIE, nullptr);
@@ -1305,7 +1329,7 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target),
MachineLocation(CallReg));
} else if (CalleeSP) {
- DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP);
+ DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP, CalleeF);
assert(CalleeDIE && "Could not create DIE for call site entry origin");
if (AddLinkageNamesToDeclCallOriginsForTuning(DD) &&
!CalleeSP->isDefinition() &&
@@ -1396,7 +1420,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
if (auto *AbsSPDie = getAbstractScopeDIEs().lookup(SP))
EntityDie = AbsSPDie;
else
- EntityDie = getOrCreateSubprogramDIE(SP);
+ EntityDie = getOrCreateSubprogramDIE(SP, nullptr);
} else if (auto *T = dyn_cast<DIType>(Entity))
EntityDie = getOrCreateTypeDIE(T);
else if (auto *GV = dyn_cast<DIGlobalVariable>(Entity))
@@ -1805,3 +1829,20 @@ DIE *DwarfCompileUnit::getOrCreateContextDIE(const DIScope *Context) {
}
return DwarfUnit::getOrCreateContextDIE(Context);
}
+
+DIE *DwarfCompileUnit::getOrCreateSubprogramDIE(const DISubprogram *SP,
+ const Function *F,
+ bool Minimal) {
+ if (!F && SP->isDefinition()) {
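+    // For definitions, recover the IR function recorded by the module scan.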
+ F = DD->getLexicalScopes().getFunction(SP);
+
+ if (!F) {
+ // SP may belong to another CU. Determine the CU similarly
+ // to DwarfDebug::constructAbstractSubprogramScopeDIE.
+ return &DD->getOrCreateAbstractSubprogramCU(SP, *this)
+ .getOrCreateAbstractSubprogramDIE(SP);
+ }
+ }
+
+ return DwarfUnit::getOrCreateSubprogramDIE(SP, F, Minimal);
+}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index c2f6ca0..a3bbc83 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -81,6 +81,7 @@ class DwarfCompileUnit final : public DwarfUnit {
// List of abstract local scopes (either DISubprogram or DILexicalBlock).
DenseMap<const DILocalScope *, DIE *> AbstractLocalScopeDIEs;
+ SmallPtrSet<const DISubprogram *, 8> FinalizedAbstractSubprograms;
// List of inlined lexical block scopes that belong to subprograms within this
// CU.
@@ -137,12 +138,28 @@ class DwarfCompileUnit final : public DwarfUnit {
return DU->getAbstractEntities();
}
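+  /// Abstract subprograms already populated. Non-shared DWO units keep
+  /// their own set; otherwise the set is shared through the DwarfFile.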
+ auto &getFinalizedAbstractSubprograms() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return FinalizedAbstractSubprograms;
+ return DU->getFinalizedAbstractSubprograms();
+ }
+
void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) override;
/// Add info for Wasm-global-based relocation.
void addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
uint64_t GlobalIndex);
+  /// Create the context DIE for an abstract subprogram.
+  /// \returns The context DIE and the compile unit where the abstract
+  /// DIE should be constructed.
+ std::pair<DIE *, DwarfCompileUnit *>
+ getOrCreateAbstractSubprogramContextDIE(const DISubprogram *SP);
+
+ /// Create new DIE for abstract subprogram.
+ DIE &createAbstractSubprogramDIE(const DISubprogram *SP, DIE *ContextDIE,
+ DwarfCompileUnit *ContextCU);
+
public:
DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
DwarfDebug *DW, DwarfFile *DWU,
@@ -216,7 +233,8 @@ public:
/// DW_AT_low_pc, DW_AT_high_pc and DW_AT_LLVM_stmt_sequence attributes.
/// If there are global variables in this scope then create and insert DIEs
/// for these variables.
- DIE &updateSubprogramScopeDIE(const DISubprogram *SP, MCSymbol *LineTableSym);
+ DIE &updateSubprogramScopeDIE(const DISubprogram *SP, const Function &F,
+ MCSymbol *LineTableSym);
void constructScopeDIE(LexicalScope *Scope, DIE &ParentScopeDIE);
@@ -259,12 +277,18 @@ public:
/// This instance of 'getOrCreateContextDIE()' can handle DILocalScope.
DIE *getOrCreateContextDIE(const DIScope *Ty) override;
+ DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, const Function *F,
+ bool Minimal = false) override;
+
/// Construct a DIE for this subprogram scope.
- DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, LexicalScope *Scope,
- MCSymbol *LineTableSym);
+ DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, const Function &F,
+ LexicalScope *Scope, MCSymbol *LineTableSym);
DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE);
+  /// Create an abstract subprogram DIE that should later be populated
+ /// by \ref constructAbstractSubprogramScopeDIE.
+ DIE &getOrCreateAbstractSubprogramDIE(const DISubprogram *SP);
void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
/// Whether to use the GNU analog for a DWARF5 tag, attribute, or location
@@ -281,14 +305,15 @@ public:
dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const;
/// Construct a call site entry DIE describing a call within \p Scope to a
- /// callee described by \p CalleeSP.
+ /// callee described by \p CalleeSP and \p CalleeF.
/// \p IsTail specifies whether the call is a tail call.
/// \p PCAddr points to the PC value after the call instruction.
/// \p CallAddr points to the PC value at the call instruction (or is null).
/// \p CallReg is a register location for an indirect call. For direct calls
/// the \p CallReg is set to 0.
DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP,
- bool IsTail, const MCSymbol *PCAddr,
+ const Function *CalleeF, bool IsTail,
+ const MCSymbol *PCAddr,
const MCSymbol *CallAddr, unsigned CallReg,
DIType *AllocSiteTy);
/// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 05d3d18..09d5f9c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -548,6 +548,16 @@ bool DwarfDebug::shareAcrossDWOCUs() const {
return SplitDwarfCrossCuReferences;
}
+DwarfCompileUnit &
+DwarfDebug::getOrCreateAbstractSubprogramCU(const DISubprogram *SP,
+ DwarfCompileUnit &SrcCU) {
+ auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
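+  // With split DWARF, the abstract DIE belongs in the referencing CU unless
+  // cross-DWO sharing is allowed.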
+ if (CU.getSkeleton())
+ return shareAcrossDWOCUs() ? CU : SrcCU;
+
+ return CU;
+}
+
void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
LexicalScope *Scope) {
assert(Scope && Scope->getScopeNode());
@@ -559,14 +569,11 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
- if (auto *SkelCU = CU.getSkeleton()) {
- (shareAcrossDWOCUs() ? CU : SrcCU)
- .constructAbstractSubprogramScopeDIE(Scope);
+ auto &TargetCU = getOrCreateAbstractSubprogramCU(SP, SrcCU);
+ TargetCU.constructAbstractSubprogramScopeDIE(Scope);
+ if (auto *SkelCU = CU.getSkeleton())
if (CU.getCUNode()->getSplitDebugInlining())
SkelCU->constructAbstractSubprogramScopeDIE(Scope);
- } else {
- CU.constructAbstractSubprogramScopeDIE(Scope);
- }
}
/// Represents a parameter whose call site value can be described by applying a
@@ -997,8 +1004,9 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
->getName(CallReg)))
<< (IsTail ? " [IsTail]" : "") << "\n");
- DIE &CallSiteDIE = CU.constructCallSiteEntryDIE(
- ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg, AllocSiteTy);
+ DIE &CallSiteDIE =
+ CU.constructCallSiteEntryDIE(ScopeDIE, CalleeSP, CalleeDecl, IsTail,
+ PCAddr, CallAddr, CallReg, AllocSiteTy);
// Optionally emit call-site-param debug info.
if (emitDebugEntryValues()) {
@@ -2707,7 +2715,8 @@ void DwarfDebug::skippedNonDebugFunction() {
// Gather and emit post-function debug information.
void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
- const DISubprogram *SP = MF->getFunction().getSubprogram();
+ const Function &F = MF->getFunction();
+ const DISubprogram *SP = F.getSubprogram();
assert(CurFn == MF &&
"endFunction should be called with the same function as beginFunction");
@@ -2776,11 +2785,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
ProcessedSPNodes.insert(SP);
DIE &ScopeDIE =
- TheCU.constructSubprogramScopeDIE(SP, FnScope, FunctionLineTableLabel);
+ TheCU.constructSubprogramScopeDIE(SP, F, FnScope, FunctionLineTableLabel);
if (auto *SkelCU = TheCU.getSkeleton())
if (!LScopes.getAbstractScopesList().empty() &&
TheCU.getCUNode()->getSplitDebugInlining())
- SkelCU->constructSubprogramScopeDIE(SP, FnScope, FunctionLineTableLabel);
+ SkelCU->constructSubprogramScopeDIE(SP, F, FnScope,
+ FunctionLineTableLabel);
FunctionLineTableLabel = nullptr;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 89813dc..1a1b28a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -906,6 +906,10 @@ public:
return CUDieMap.lookup(Die);
}
+ /// Find the matching DwarfCompileUnit for the given SP referenced from SrcCU.
+ DwarfCompileUnit &getOrCreateAbstractSubprogramCU(const DISubprogram *SP,
+ DwarfCompileUnit &SrcCU);
+
unsigned getStringTypeLoc(const DIStringType *ST) const {
return StringTypeLocMap.lookup(ST);
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
index 0fc2b91..ef1524d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -11,6 +11,7 @@
#include "DwarfStringPool.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/DIE.h"
@@ -27,6 +28,7 @@ class DbgVariable;
class DbgLabel;
class DINode;
class DILocalScope;
+class DISubprogram;
class DwarfCompileUnit;
class DwarfUnit;
class LexicalScope;
@@ -94,6 +96,9 @@ class DwarfFile {
// Collection of abstract subprogram DIEs.
DenseMap<const DILocalScope *, DIE *> AbstractLocalScopeDIEs;
DenseMap<const DINode *, std::unique_ptr<DbgEntity>> AbstractEntities;
+ /// Keeps track of abstract subprograms to populate them only once.
+ // FIXME: merge creation and population of abstract scopes.
+ SmallPtrSet<const DISubprogram *, 8> FinalizedAbstractSubprograms;
/// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
/// be shared across CUs, that is why we keep the map here instead
@@ -174,6 +179,10 @@ public:
return AbstractEntities;
}
+ auto &getFinalizedAbstractSubprograms() {
+ return FinalizedAbstractSubprograms;
+ }
+
void insertDIE(const MDNode *TypeMD, DIE *Die) {
DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index d76fd0c..62fb5eb 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -573,7 +573,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
if (auto *NS = dyn_cast<DINamespace>(Context))
return getOrCreateNameSpace(NS);
if (auto *SP = dyn_cast<DISubprogram>(Context))
- return getOrCreateSubprogramDIE(SP);
+ return getOrCreateSubprogramDIE(SP, nullptr);
if (auto *M = dyn_cast<DIModule>(Context))
return getOrCreateModule(M);
return getDIE(Context);
@@ -1066,7 +1066,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
if (!Element)
continue;
if (auto *SP = dyn_cast<DISubprogram>(Element))
- getOrCreateSubprogramDIE(SP);
+ getOrCreateSubprogramDIE(SP, nullptr);
else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
if (DDTy->getTag() == dwarf::DW_TAG_friend) {
DIE &ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
@@ -1335,22 +1335,21 @@ DIE *DwarfUnit::getOrCreateModule(const DIModule *M) {
return &MDie;
}
-DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) {
+DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP,
+ const Function *FnHint, bool Minimal) {
// Construct the context before querying for the existence of the DIE in case
// such construction creates the DIE (as is the case for member function
// declarations).
DIE *ContextDIE =
- Minimal ? &getUnitDie() : getOrCreateContextDIE(SP->getScope());
+ getOrCreateSubprogramContextDIE(SP, shouldPlaceInUnitDIE(SP, Minimal));
if (DIE *SPDie = getDIE(SP))
return SPDie;
if (auto *SPDecl = SP->getDeclaration()) {
if (!Minimal) {
- // Add subprogram definitions to the CU die directly.
- ContextDIE = &getUnitDie();
// Build the decl now to ensure it precedes the definition.
- getOrCreateSubprogramDIE(SPDecl);
+ getOrCreateSubprogramDIE(SPDecl, nullptr);
// Check whether the DIE for SP has already been created after the call
// above.
// FIXME: Should the creation of definition subprogram DIE during
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index fe05766..bb00ec3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -256,7 +256,9 @@ public:
DIE *getOrCreateNameSpace(const DINamespace *NS);
DIE *getOrCreateModule(const DIModule *M);
- DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false);
+ virtual DIE *getOrCreateSubprogramDIE(const DISubprogram *SP,
+ const Function *FnHint,
+ bool Minimal = false);
void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
bool SkipSPAttributes = false);
@@ -343,6 +345,18 @@ protected:
/// Emit the common part of the header for this unit.
void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT);
+ bool shouldPlaceInUnitDIE(const DISubprogram *SP, bool Minimal) {
+ // Add subprogram declarations to the CU die directly.
+ return Minimal || SP->getDeclaration();
+ }
+
+ DIE *getOrCreateSubprogramContextDIE(const DISubprogram *SP,
+ bool IgnoreScope) {
+ if (IgnoreScope)
+ return &getUnitDie();
+ return getOrCreateContextDIE(SP->getScope());
+ }
+
private:
/// A helper to add a wide integer constant to a DIE using a block
/// form.
diff --git a/llvm/lib/CodeGen/LexicalScopes.cpp b/llvm/lib/CodeGen/LexicalScopes.cpp
index 5916f61..9fc9ac9 100644
--- a/llvm/lib/CodeGen/LexicalScopes.cpp
+++ b/llvm/lib/CodeGen/LexicalScopes.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -36,8 +37,16 @@ using namespace llvm;
#define DEBUG_TYPE "lexicalscopes"
-/// reset - Reset the instance so that it's prepared for another function.
-void LexicalScopes::reset() {
+static bool skipUnit(const DICompileUnit *CU) {
+ return CU->getEmissionKind() == DICompileUnit::NoDebug;
+}
+
+void LexicalScopes::resetModule() {
+ FunctionMap.clear();
+ resetFunction();
+}
+
+void LexicalScopes::resetFunction() {
MF = nullptr;
CurrentFnLexicalScope = nullptr;
LexicalScopeMap.clear();
@@ -47,12 +56,19 @@ void LexicalScopes::reset() {
DominatedBlocks.clear();
}
-/// initialize - Scan machine function and constuct lexical scope nest.
-void LexicalScopes::initialize(const MachineFunction &Fn) {
- reset();
+void LexicalScopes::initialize(const Module &M) {
+ resetModule();
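+  // Map each subprogram to its IR function, skipping NoDebug compile units.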
+ for (const Function &F : M) {
+ DISubprogram *SP = F.getSubprogram();
+ if (SP && (!SP->getUnit() || !skipUnit(SP->getUnit())))
+ FunctionMap[SP] = &F;
+ }
+}
+
+void LexicalScopes::scanFunction(const MachineFunction &Fn) {
+ resetFunction();
// Don't attempt any lexical scope creation for a NoDebug compile unit.
- if (Fn.getFunction().getSubprogram()->getUnit()->getEmissionKind() ==
- DICompileUnit::NoDebug)
+ if (skipUnit(Fn.getFunction().getSubprogram()->getUnit()))
return;
MF = &Fn;
SmallVector<InsnRange, 4> MIRanges;
@@ -143,8 +159,7 @@ LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,
const DILocation *IA) {
if (IA) {
// Skip scopes inlined from a NoDebug compile unit.
- if (Scope->getSubprogram()->getUnit()->getEmissionKind() ==
- DICompileUnit::NoDebug)
+ if (skipUnit(Scope->getSubprogram()->getUnit()))
return getOrCreateLexicalScope(IA);
// Create an abstract scope for inlined function.
getOrCreateAbstractScope(Scope);
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index a8143bd..0037bdd 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -3721,7 +3721,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
TFI = MF.getSubtarget().getFrameLowering();
TFI->getCalleeSaves(MF, CalleeSavedRegs);
MFI = &MF.getFrameInfo();
- LS.initialize(MF);
+ LS.scanFunction(MF);
const auto &STI = MF.getSubtarget();
AdjustsStackInCalls = MFI->adjustsStack() &&
diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
index 82e0c28..b9ea03f 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
@@ -2231,7 +2231,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF,
TFI->getCalleeSaves(MF, CalleeSavedRegs);
this->ShouldEmitDebugEntryValues = ShouldEmitDebugEntryValues;
- LS.initialize(MF);
+ LS.scanFunction(MF);
bool Changed = false;
bool OLChanged = false;
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 84e9f03..001ba52 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -1263,7 +1263,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
void LiveDebugVariables::LDVImpl::computeIntervals() {
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
for (const auto &UV : userValues) {
UV->computeIntervals(MF->getRegInfo(), *TRI, *LIS, LS);
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index e911ce8..1154855 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1549,7 +1549,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
report("G_BUILD_VECTOR result element type must match source type", MI);
if (DstTy.getNumElements() != MI->getNumOperands() - 1)
- report("G_BUILD_VECTOR must have an operand for each elemement", MI);
+ report("G_BUILD_VECTOR must have an operand for each element", MI);
for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2))
if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg()))
@@ -2398,11 +2398,11 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
// The next two checks allow COPY between physical and virtual registers,
// when the virtual register has a scalable size and the physical register
- // has a fixed size. These checks allow COPY between *potentialy* mismatched
- // sizes. However, once RegisterBankSelection occurs, MachineVerifier should
- // be able to resolve a fixed size for the scalable vector, and at that
- // point this function will know for sure whether the sizes are mismatched
- // and correctly report a size mismatch.
+ // has a fixed size. These checks allow COPY between *potentially*
+ // mismatched sizes. However, once RegisterBankSelection occurs,
+ // MachineVerifier should be able to resolve a fixed size for the scalable
+ // vector, and at that point this function will know for sure whether the
+ // sizes are mismatched and correctly report a size mismatch.
if (SrcReg.isPhysical() && DstReg.isVirtual() && DstSize.isScalable() &&
!SrcSize.isScalable())
break;
@@ -3213,13 +3213,13 @@ struct VRegFilter {
private:
static constexpr unsigned SparseUniverseMax = 10 * 1024 * 8;
- // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyound
- // are tracked by Dense. The only purpose of the threashold and the Dense set
+ // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyond
+ // are tracked by Dense. The only purpose of the threshold and the Dense set
// is to have a reasonably growing memory usage in pathological cases (large
// number of very sparse VRegFilter instances live at the same time). In
// practice even in the worst-by-execution time cases having all elements
// tracked by Sparse (very large SparseUniverseMax scenario) tends to be more
- // space efficient than if tracked by Dense. The threashold is set to keep the
+ // space efficient than if tracked by Dense. The threshold is set to keep the
// worst-case memory usage within 2x of figures determined empirically for
// "all Dense" scenario in such worst-by-execution-time cases.
BitVector Sparse;
@@ -3459,7 +3459,7 @@ void MachineVerifier::visitMachineFunctionAfter() {
// Check live-in list of each MBB. If a register is live into MBB, check
// that the register is in regsLiveOut of each predecessor block. Since
- // this must come from a definition in the predecesssor or its live-in
+ // this must come from a definition in the predecessor or its live-in
// list, this will catch a live-through case where the predecessor does not
// have the register in its live-in list. This currently only checks
// registers that have no aliases, are not allocatable and are not
diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp
index 318422b..2e1cf49 100644
--- a/llvm/lib/CodeGen/RDFLiveness.cpp
+++ b/llvm/lib/CodeGen/RDFLiveness.cpp
@@ -652,8 +652,9 @@ void Liveness::computePhiInfo() {
// defs, cache the result of subtracting these defs from a given register
// ref.
using RefHash = std::hash<RegisterRef>;
- using RefEqual = std::equal_to<RegisterRef>;
- using SubMap = std::unordered_map<RegisterRef, RegisterRef>;
+ using RefEqual = RegisterRefEqualTo;
+ using SubMap =
+ std::unordered_map<RegisterRef, RegisterRef, RefHash, RefEqual>;
std::unordered_map<RegisterAggr, SubMap> Subs;
auto ClearIn = [](RegisterRef RR, const RegisterAggr &Mid, SubMap &SM) {
if (Mid.empty())
@@ -868,7 +869,7 @@ void Liveness::computeLiveIns() {
std::vector<RegisterRef> LV;
for (const MachineBasicBlock::RegisterMaskPair &LI : B.liveins())
LV.push_back(RegisterRef(LI.PhysReg, LI.LaneMask));
- llvm::sort(LV, std::less<RegisterRef>(PRI));
+ llvm::sort(LV, RegisterRefLess(PRI));
dbgs() << printMBBReference(B) << "\t rec = {";
for (auto I : LV)
dbgs() << ' ' << Print(I, DFG);
@@ -878,7 +879,7 @@ void Liveness::computeLiveIns() {
LV.clear();
for (RegisterRef RR : LiveMap[&B].refs())
LV.push_back(RR);
- llvm::sort(LV, std::less<RegisterRef>(PRI));
+ llvm::sort(LV, RegisterRefLess(PRI));
dbgs() << "\tcomp = {";
for (auto I : LV)
dbgs() << ' ' << Print(I, DFG);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c815686..77df4b4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17983,8 +17983,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
// (fsub A, 0) -> A
if (N1CFP && N1CFP->isZero()) {
- if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
- Flags.hasNoSignedZeros()) {
+ if (!N1CFP->isNegative() || Flags.hasNoSignedZeros()) {
return N0;
}
}
@@ -17997,8 +17996,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
// (fsub -0.0, N1) -> -N1
if (N0CFP && N0CFP->isZero()) {
- if (N0CFP->isNegative() ||
- (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
+ if (N0CFP->isNegative() || Flags.hasNoSignedZeros()) {
// We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
// flushed to zero, unless all users treat denorms as zero (DAZ).
// FIXME: This transform will change the sign of a NaN and the behavior
@@ -18014,8 +18012,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
}
}
- if ((Options.NoSignedZerosFPMath ||
- (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
+ if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros() &&
N1.getOpcode() == ISD::FADD) {
// X - (X + Y) -> -Y
if (N0 == N1->getOperand(0))
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index dba5a8c..cc503d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7492,7 +7492,6 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
// Pre-increment recursion depth for use in recursive calls.
++Depth;
const SDNodeFlags Flags = Op->getFlags();
- const TargetOptions &Options = DAG.getTarget().Options;
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
@@ -7572,7 +7571,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
return DAG.getBuildVector(VT, DL, Ops);
}
case ISD::FADD: {
- if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ if (!Flags.hasNoSignedZeros())
break;
// After operation legalization, it might not be legal to create new FSUBs.
@@ -7617,7 +7616,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
}
case ISD::FSUB: {
// We can't turn -(A-B) into B-A when we honor signed zeros.
- if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ if (!Flags.hasNoSignedZeros())
break;
SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
@@ -7678,7 +7677,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
}
case ISD::FMA:
case ISD::FMAD: {
- if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ if (!Flags.hasNoSignedZeros())
break;
SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2);
@@ -8797,7 +8796,6 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
EVT VT = Node->getValueType(0);
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
bool IsMax = Opc == ISD::FMAXIMUMNUM;
- const TargetOptions &Options = DAG.getTarget().Options;
SDNodeFlags Flags = Node->getFlags();
unsigned NewOp =
@@ -8858,8 +8856,8 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
// TODO: We need quiet sNaN if strictfp.
// Fixup signed zero behavior.
- if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() ||
- DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)) {
+ if (Flags.hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(LHS) ||
+ DAG.isKnownNeverZeroFloat(RHS)) {
return MinMax;
}
SDValue TestZero =
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index daebf447..dd83168 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2847,6 +2847,7 @@ unsigned CastInst::isEliminableCastPair(
// FPTRUNC > FloatPt n/a FloatPt n/a
// FPEXT < FloatPt n/a FloatPt n/a
// PTRTOINT n/a Pointer n/a Integral Unsigned
+ // PTRTOADDR n/a Pointer n/a Integral Unsigned
// INTTOPTR n/a Integral Unsigned Pointer n/a
// BITCAST = FirstClass n/a FirstClass n/a
// ADDRSPCST n/a Pointer n/a Pointer n/a
@@ -2878,7 +2879,7 @@ unsigned CastInst::isEliminableCastPair(
{ 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt |
{ 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt |
{ 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr |
- { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr |
+ { 99,99,99,99,99,99,99,99,99,11,11,99,15, 0}, // IntToPtr |
{ 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast |
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
};
@@ -2972,7 +2973,8 @@ unsigned CastInst::isEliminableCastPair(
// zext, sext -> zext, because sext can't sign extend after zext
return Instruction::ZExt;
case 11: {
- // inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
+ // inttoptr, ptrtoint/ptrtoaddr -> bitcast if SrcSize<=PtrSize and
+ // SrcSize==DstSize
if (!MidIntPtrTy)
return 0;
unsigned PtrSize = MidIntPtrTy->getScalarSizeInBits();
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index 5827292..99029c1 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -252,6 +252,13 @@ void setExplicitlyUnknownBranchWeights(Instruction &I, StringRef PassName) {
MDB.createString(PassName)}));
}
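+// Only mark branch weights unknown when \p F has a nonzero entry count.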
+void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, Function &F,
+ StringRef PassName) {
+ if (std::optional<Function::ProfileCount> EC = F.getEntryCount();
+ EC && EC->getCount() > 0)
+ setExplicitlyUnknownBranchWeights(I, PassName);
+}
+
void setExplicitlyUnknownFunctionEntryCount(Function &F, StringRef PassName) {
MDBuilder MDB(F.getContext());
F.setMetadata(
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index b2e76cc..8c03d6f 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5869,9 +5869,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
break;
}
case Intrinsic::call_preallocated_setup: {
- auto *NumArgs = dyn_cast<ConstantInt>(Call.getArgOperand(0));
- Check(NumArgs != nullptr,
- "llvm.call.preallocated.setup argument must be a constant");
+ auto *NumArgs = cast<ConstantInt>(Call.getArgOperand(0));
bool FoundCall = false;
for (User *U : Call.users()) {
auto *UseCall = dyn_cast<CallBase>(U);
diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp
index 2338370..713aa3d8 100644
--- a/llvm/lib/MC/DXContainerRootSignature.cpp
+++ b/llvm/lib/MC/DXContainerRootSignature.cpp
@@ -23,9 +23,8 @@ static uint32_t writePlaceholder(raw_svector_ostream &Stream) {
static uint32_t rewriteOffsetToCurrentByte(raw_svector_ostream &Stream,
uint32_t Offset) {
uint32_t ByteOffset = Stream.tell();
- uint32_t Value =
- support::endian::byte_swap<uint32_t, llvm::endianness::little>(
- ByteOffset);
+ uint32_t Value = support::endian::byte_swap<uint32_t>(
+ ByteOffset, llvm::endianness::little);
Stream.pwrite(reinterpret_cast<const char *>(&Value), sizeof(Value), Offset);
return ByteOffset;
}
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index 6fc0889..a112597 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -1119,10 +1119,26 @@ Error writeArchiveToStream(raw_ostream &Out,
// to switch to 64-bit. Note that the file can be larger than 4GB as long as
// the last member starts before the 4GB offset.
if (*HeadersSize + LastMemberHeaderOffset >= Sym64Threshold) {
- if (Kind == object::Archive::K_DARWIN)
+ switch (Kind) {
+ case object::Archive::K_COFF:
+ // COFF format has no 64-bit version, so we use GNU64 instead.
+ if (!SymMap.Map.empty() && !SymMap.ECMap.empty())
+        // Only the COFF format supports the ECSYMBOLS section, so don't use
+ // GNU64 when two symbol maps are required.
+ return make_error<object::GenericBinaryError>(
+ "Archive is too large: ARM64X does not support archives larger "
+ "than 4GB");
+ // Since this changes the headers, we need to recalculate everything.
+ return writeArchiveToStream(Out, NewMembers, WriteSymtab,
+ object::Archive::K_GNU64, Deterministic,
+ Thin, IsEC, Warn);
+ case object::Archive::K_DARWIN:
Kind = object::Archive::K_DARWIN64;
- else
+ break;
+ default:
Kind = object::Archive::K_GNU64;
+ break;
+ }
HeadersSize.reset();
}
}
diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp
index 1e1042c..0dd378e 100644
--- a/llvm/lib/Object/OffloadBundle.cpp
+++ b/llvm/lib/Object/OffloadBundle.cpp
@@ -89,17 +89,17 @@ Error OffloadBundleFatBin::readEntries(StringRef Buffer,
uint64_t EntryIDSize;
StringRef EntryID;
- if (auto EC = Reader.readInteger(EntryOffset))
- return errorCodeToError(object_error::parse_failed);
+ if (Error Err = Reader.readInteger(EntryOffset))
+ return Err;
- if (auto EC = Reader.readInteger(EntrySize))
- return errorCodeToError(object_error::parse_failed);
+ if (Error Err = Reader.readInteger(EntrySize))
+ return Err;
- if (auto EC = Reader.readInteger(EntryIDSize))
- return errorCodeToError(object_error::parse_failed);
+ if (Error Err = Reader.readInteger(EntryIDSize))
+ return Err;
- if (auto EC = Reader.readFixedString(EntryID, EntryIDSize))
- return errorCodeToError(object_error::parse_failed);
+ if (Error Err = Reader.readFixedString(EntryID, EntryIDSize))
+ return Err;
auto Entry = std::make_unique<OffloadBundleEntry>(
EntryOffset + SectionOffset, EntrySize, EntryIDSize, EntryID);
@@ -125,7 +125,7 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset,
// Read the Bundle Entries
Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset);
if (Err)
- return errorCodeToError(object_error::parse_failed);
+ return Err;
return std::unique_ptr<OffloadBundleFatBin>(TheBundle);
}
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index fc2577e..075ad8d 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -949,9 +949,9 @@ loadTestingFormat(StringRef Data, StringRef CompilationDir) {
if (Data.size() < sizeof(uint64_t))
return make_error<CoverageMapError>(coveragemap_error::malformed,
"the size of data is too small");
- auto TestingVersion =
- support::endian::byte_swap<uint64_t, llvm::endianness::little>(
- *reinterpret_cast<const uint64_t *>(Data.data()));
+ auto TestingVersion = support::endian::byte_swap<uint64_t>(
+ *reinterpret_cast<const uint64_t *>(Data.data()),
+ llvm::endianness::little);
Data = Data.substr(sizeof(uint64_t));
// Read the ProfileNames data.
@@ -1274,9 +1274,9 @@ BinaryCoverageReader::create(
std::vector<std::unique_ptr<BinaryCoverageReader>> Readers;
if (ObjectBuffer.getBuffer().size() > sizeof(TestingFormatMagic)) {
- uint64_t Magic =
- support::endian::byte_swap<uint64_t, llvm::endianness::little>(
- *reinterpret_cast<const uint64_t *>(ObjectBuffer.getBufferStart()));
+ uint64_t Magic = support::endian::byte_swap<uint64_t>(
+ *reinterpret_cast<const uint64_t *>(ObjectBuffer.getBufferStart()),
+ llvm::endianness::little);
if (Magic == TestingFormatMagic) {
// This is a special format used for testing.
auto ReaderOrErr =
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index 12b1687..3875f01 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -292,7 +292,7 @@ void CoverageMappingWriter::write(raw_ostream &OS) {
void TestingFormatWriter::write(raw_ostream &OS, TestingFormatVersion Version) {
auto ByteSwap = [](uint64_t N) {
- return support::endian::byte_swap<uint64_t, llvm::endianness::little>(N);
+ return support::endian::byte_swap<uint64_t>(N, llvm::endianness::little);
};
// Output a 64bit magic number.
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index e1c6315..3c8e44a 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -292,7 +292,7 @@ void ProfOStream::patch(ArrayRef<PatchItem> P) {
for (const auto &K : P) {
for (int I = 0, E = K.D.size(); I != E; I++) {
uint64_t Bytes =
- endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+ endian::byte_swap<uint64_t>(K.D[I], llvm::endianness::little);
Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
(const char *)&Bytes, sizeof(uint64_t));
}
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 1da92ea..d2ae4b5 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1186,10 +1186,10 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
if (Version >= IndexedInstrProf::Version4) {
const IndexedInstrProf::Summary *SummaryInLE =
reinterpret_cast<const IndexedInstrProf::Summary *>(Cur);
- uint64_t NFields = endian::byte_swap<uint64_t, llvm::endianness::little>(
- SummaryInLE->NumSummaryFields);
- uint64_t NEntries = endian::byte_swap<uint64_t, llvm::endianness::little>(
- SummaryInLE->NumCutoffEntries);
+ uint64_t NFields = endian::byte_swap<uint64_t>(
+ SummaryInLE->NumSummaryFields, llvm::endianness::little);
+ uint64_t NEntries = endian::byte_swap<uint64_t>(
+ SummaryInLE->NumCutoffEntries, llvm::endianness::little);
uint32_t SummarySize =
IndexedInstrProf::Summary::getSize(NFields, NEntries);
std::unique_ptr<IndexedInstrProf::Summary> SummaryData =
@@ -1198,7 +1198,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
const uint64_t *Src = reinterpret_cast<const uint64_t *>(SummaryInLE);
uint64_t *Dst = reinterpret_cast<uint64_t *>(SummaryData.get());
for (unsigned I = 0; I < SummarySize / sizeof(uint64_t); I++)
- Dst[I] = endian::byte_swap<uint64_t, llvm::endianness::little>(Src[I]);
+ Dst[I] = endian::byte_swap<uint64_t>(Src[I], llvm::endianness::little);
SummaryEntryVector DetailedSummary;
for (unsigned I = 0; I < SummaryData->NumCutoffEntries; I++) {
diff --git a/llvm/lib/Support/FileCollector.cpp b/llvm/lib/Support/FileCollector.cpp
index 5dc224a..1e5de2c 100644
--- a/llvm/lib/Support/FileCollector.cpp
+++ b/llvm/lib/Support/FileCollector.cpp
@@ -68,9 +68,8 @@ void FileCollector::PathCanonicalizer::updateWithRealPath(
SmallString<256> RealPath;
auto DirWithSymlink = CachedDirs.find(Directory);
if (DirWithSymlink == CachedDirs.end()) {
- // FIXME: Should this be a call to FileSystem::getRealpath(), in some
- // cases? What if there is nothing on disk?
- if (sys::fs::real_path(Directory, RealPath))
+ // FIXME: What if there is nothing on disk?
+ if (VFS->getRealPath(Directory, RealPath))
return;
CachedDirs[Directory] = std::string(RealPath);
} else {
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 686688a..8da6fdb 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -282,18 +282,15 @@ void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) {
// For example:
// The template string
// " \t{{#section}}A{{/section}}"
-// would be considered as having no text ahead and would be render as
+// would be considered as having no text ahead and would be rendered as:
// "A"
-// The exception for this is partial tag which requires us to
-// keep track of the indentation once it's rendered.
void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx,
Token &CurrentToken, Token::Type CurrentType) {
Token &PrevToken = Tokens[Idx - 1];
StringRef PrevTokenBody = PrevToken.TokenBody;
StringRef Unindented = PrevTokenBody.rtrim(" \r\t\v");
size_t Indentation = PrevTokenBody.size() - Unindented.size();
- if (CurrentType != Token::Type::Partial)
- PrevToken.TokenBody = Unindented.str();
+ PrevToken.TokenBody = Unindented.str();
CurrentToken.setIndentation(Indentation);
}
@@ -439,7 +436,8 @@ class AddIndentationStringStream : public raw_ostream {
public:
explicit AddIndentationStringStream(llvm::raw_ostream &WrappedStream,
size_t Indentation)
- : Indentation(Indentation), WrappedStream(WrappedStream) {
+ : Indentation(Indentation), WrappedStream(WrappedStream),
+ NeedsIndent(true) {
SetUnbuffered();
}
@@ -448,10 +446,15 @@ protected:
llvm::StringRef Data(Ptr, Size);
SmallString<0> Indent;
Indent.resize(Indentation, ' ');
+
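+    // Indent lazily: emit the indent just before the first character of a
+    // line so that empty lines stay unindented.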
for (char C : Data) {
+ if (NeedsIndent && C != '\n') {
+ WrappedStream << Indent;
+ NeedsIndent = false;
+ }
WrappedStream << C;
if (C == '\n')
- WrappedStream << Indent;
+ NeedsIndent = true;
}
}
@@ -460,6 +463,7 @@ protected:
private:
size_t Indentation;
llvm::raw_ostream &WrappedStream;
+ bool NeedsIndent;
};
class Parser {
diff --git a/llvm/lib/Support/SipHash.cpp b/llvm/lib/Support/SipHash.cpp
index 86dad66..382d36f 100644
--- a/llvm/lib/Support/SipHash.cpp
+++ b/llvm/lib/Support/SipHash.cpp
@@ -35,14 +35,19 @@ void llvm::getSipHash_2_4_128(ArrayRef<uint8_t> In, const uint8_t (&K)[16],
siphash<2, 4>(In.data(), In.size(), K, Out);
}
-/// Compute an ABI-stable 16-bit hash of the given string.
-uint16_t llvm::getPointerAuthStableSipHash(StringRef Str) {
+/// Compute an ABI-stable 64-bit hash of the given string.
+uint64_t llvm::getStableSipHash(StringRef Str) {
static const uint8_t K[16] = {0xb5, 0xd4, 0xc9, 0xeb, 0x79, 0x10, 0x4a, 0x79,
0x6f, 0xec, 0x8b, 0x1b, 0x42, 0x87, 0x81, 0xd4};
uint8_t RawHashBytes[8];
getSipHash_2_4_64(arrayRefFromStringRef(Str), K, RawHashBytes);
- uint64_t RawHash = endian::read64le(RawHashBytes);
+ return endian::read64le(RawHashBytes);
+}
+
+/// Compute an ABI-stable 16-bit hash of the given string.
+uint16_t llvm::getPointerAuthStableSipHash(StringRef Str) {
+ uint64_t RawHash = getStableSipHash(Str);
// Produce a non-zero 16-bit discriminator.
uint16_t Discriminator = (RawHash % 0xFFFF) + 1;
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index cf78459..7ff62d4 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -1973,7 +1973,7 @@ private:
EC = FS->makeAbsolute(FullPath, Name);
Name = canonicalize(Name);
} else {
- EC = sys::fs::make_absolute(Name);
+ EC = FS->makeAbsolute(Name);
}
if (EC) {
assert(NameValueNode && "Name presence should be checked earlier");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a4c1e26..899baa9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8086,13 +8086,76 @@ static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
}
+// Emit a call to __arm_sme_save or __arm_sme_restore.
+static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
+ SelectionDAG &DAG,
+ AArch64FunctionInfo *Info, SDLoc DL,
+ SDValue Chain, bool IsSave) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ FuncInfo->setSMESaveBufferUsed();
+ TargetLowering::ArgListTy Args;
+ Args.emplace_back(
+ DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
+ PointerType::getUnqual(*DAG.getContext()));
+
+ RTLIB::Libcall LC =
+ IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+ auto *RetTy = Type::getVoidTy(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
+ return TLI.LowerCallTo(CLI).second;
+}
+
+static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL,
+ const AArch64TargetLowering &TLI,
+ const AArch64RegisterInfo &TRI,
+ AArch64FunctionInfo &FuncInfo,
+ SelectionDAG &DAG) {
+ // Conditionally restore the lazy save using a pseudo node.
+ RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
+ TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
+ SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
+ DAG.getMachineFunction(), TLI.getLibcallCallingConv(LC)));
+ SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
+ TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue TPIDR2_EL0 = DAG.getNode(
+ ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
+ DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
+ // Copy the address of the TPIDR2 block into X0 before 'calling' the
+ // RESTORE_ZA pseudo.
+ SDValue Glue;
+ SDValue TPIDR2Block = DAG.getFrameIndex(
+ TPIDR2.FrameIndex,
+ DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+ Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
+ Chain =
+ DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
+ {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
+ RestoreRoutine, RegMask, Chain.getValue(1)});
+ // Finally reset the TPIDR2_EL0 register to 0.
+ Chain = DAG.getNode(
+ ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
+ DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
+ DAG.getConstant(0, DL, MVT::i64));
+ TPIDR2.Uses++;
+ return Chain;
+}
+
SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
SelectionDAG &DAG) const {
assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
SDValue Glue = Chain.getValue(1);
MachineFunction &MF = DAG.getMachineFunction();
- SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
+ auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+
+ SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
// The following conditions are true on entry to an exception handler:
// - PSTATE.SM is 0.
@@ -8107,14 +8170,43 @@ SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
// These mode changes are usually optimized away in catch blocks as they
// occur before the __cxa_begin_catch (which is a non-streaming function),
// but are necessary in some cases (such as for cleanups).
+ //
+ // Additionally, if the function has ZA or ZT0 state, we must restore it.
+ // [COND_]SMSTART SM
if (SMEFnAttrs.hasStreamingInterfaceOrBody())
- return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
- /*Glue*/ Glue, AArch64SME::Always);
+ Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
+ /*Glue*/ Glue, AArch64SME::Always);
+ else if (SMEFnAttrs.hasStreamingCompatibleInterface())
+ Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
+ AArch64SME::IfCallerIsStreaming);
- if (SMEFnAttrs.hasStreamingCompatibleInterface())
- return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
- AArch64SME::IfCallerIsStreaming);
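+  // With the new SME ABI lowering, ZA/ZT0 state restoration is handled
+  // separately, so there is nothing further to emit here.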
+ if (getTM().useNewSMEABILowering())
+ return Chain;
+
+ if (SMEFnAttrs.hasAgnosticZAInterface()) {
+ // Restore full ZA
+ Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
+ /*IsSave=*/false);
+ } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
+ // SMSTART ZA
+ Chain = DAG.getNode(
+ AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
+ DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
+
+ // Restore ZT0
+ if (SMEFnAttrs.hasZT0State()) {
+ SDValue ZT0FrameIndex =
+ getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
+ Chain =
+ DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
+ {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
+ }
+
+ // Restore ZA
+ if (SMEFnAttrs.hasZAState())
+ Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
+ }
return Chain;
}
@@ -9232,30 +9324,6 @@ SDValue AArch64TargetLowering::changeStreamingMode(
return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
}
-// Emit a call to __arm_sme_save or __arm_sme_restore.
-static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
- SelectionDAG &DAG,
- AArch64FunctionInfo *Info, SDLoc DL,
- SDValue Chain, bool IsSave) {
- MachineFunction &MF = DAG.getMachineFunction();
- AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
- FuncInfo->setSMESaveBufferUsed();
- TargetLowering::ArgListTy Args;
- Args.emplace_back(
- DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
- PointerType::getUnqual(*DAG.getContext()));
-
- RTLIB::Libcall LC =
- IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
- SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy(DAG.getDataLayout()));
- auto *RetTy = Type::getVoidTy(*DAG.getContext());
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
- return TLI.LowerCallTo(CLI).second;
-}
-
static AArch64SME::ToggleCondition
getSMToggleCondition(const SMECallAttrs &CallAttrs) {
if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
@@ -10015,33 +10083,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
{Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
if (RequiresLazySave) {
- // Conditionally restore the lazy save using a pseudo node.
- RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
- TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
- SDValue RegMask = DAG.getRegisterMask(
- TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC)));
- SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
- getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
- SDValue TPIDR2_EL0 = DAG.getNode(
- ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
- DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
- // Copy the address of the TPIDR2 block into X0 before 'calling' the
- // RESTORE_ZA pseudo.
- SDValue Glue;
- SDValue TPIDR2Block = DAG.getFrameIndex(
- TPIDR2.FrameIndex,
- DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
- Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
- Result =
- DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
- {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
- RestoreRoutine, RegMask, Result.getValue(1)});
- // Finally reset the TPIDR2_EL0 register to 0.
- Result = DAG.getNode(
- ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
- DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i64));
- TPIDR2.Uses++;
+ Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
} else if (RequiresSaveAllZA) {
Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
/*IsSave=*/false);
@@ -11736,6 +11778,28 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
}
+ // Check for sign bit test patterns that can use TST optimization.
+ // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
+ // -> TST %operand, sign_bit; CSEL
+ // (SELECT_CC setlt, sign_extend, 0, tval, fval)
+ // -> TST %operand, sign_bit; CSEL
+ if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
+ (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ LHS.getOpcode() == ISD::SIGN_EXTEND)) {
+
+ uint64_t SignBitPos;
+ std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
+ EVT TestVT = LHS.getValueType();
+ SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
+ SDValue TST =
+ DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
+ LHS, SignBitConst);
+
+ SDValue Flags = TST.getValue(1);
+ return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
+ DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
+ }
+
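The sign-bit TST pattern above is sound because, for a sign-extended value, setlt against 0 holds exactly when the sign bit of the narrow source is set, which ANDS exposes through NZCV. A standalone check of the equivalence (plain C++, not compiler code):

    #include <cassert>
    #include <cstdint>
    int main() {
      for (int i = 0; i < 256; ++i) {
        int8_t Narrow = static_cast<int8_t>(i);
        int64_t Wide = Narrow;                // sign_extend to i64
        bool SetLT = Wide < 0;                // SELECT_CC setlt ..., 0
        bool TstNE = (i & 0x80) != 0;         // TST %operand, sign_bit
        assert(SetLT == TstNE);
      }
    }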
// Canonicalise absolute difference patterns:
// select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
// select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 04b3c90..f788c75 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -9907,8 +9907,14 @@ def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))),
def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))),
(v4bf16 (REV64v4i16 FPR64:$src))>;
}
-def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
-def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))),
+ (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v4bf16 FPR64:$src))),
+ (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))),
+ (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4f16 FPR64:$src))),
+ (v4bf16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
@@ -10236,8 +10242,14 @@ def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))),
def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))),
(v8bf16 (REV32v8i16 FPR128:$src))>;
}
-def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
-def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))),
+ (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))),
+ (v8bf16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8bf16 FPR128:$src))),
+ (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8f16 FPR128:$src))),
+ (v8bf16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 38718c4..7504f1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -150,7 +150,10 @@ public:
if (!CVisited.insert(CII).second)
continue;
- if (CII->getParent() == II->getParent() && !IsLookThru(II))
+      // The same-BB filter must look at the *user*, and must still allow
+      // non-lookthrough users when the def is a PHI (loop-header pattern).
+ if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+ !isa<PHINode>(II))
continue;
if (isOpLegal(CII))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index d9bfeae..0a59132 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2562,7 +2562,9 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
for (Function *F : NeedsPostProcess)
Splitter.processFunction(*F);
for (Function *F : Intrinsics) {
- if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
+    // A use_empty() intrinsic can also occur for cases like masked loads,
+    // which have been rewritten out of the module by now but not yet erased.
+ if (F->use_empty() || isRemovablePointerIntrinsic(F->getIntrinsicID())) {
F->eraseFromParent();
} else {
std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 2381effb..1f773e2 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -632,6 +632,19 @@ static bool checkDenormalAttributeConsistency(const Module &M, StringRef Attr,
});
}
+// Returns true if any function definition's denormal mode differs from the
+// mode of the module's first function.
+static bool checkDenormalAttributeInconsistency(const Module &M) {
+ auto F = M.functions().begin();
+ auto E = M.functions().end();
+ if (F == E)
+ return false;
+ DenormalMode Value = F->getDenormalModeRaw();
+ ++F;
+ return std::any_of(F, E, [&](const Function &F) {
+ return !F.isDeclaration() && F.getDenormalModeRaw() != Value;
+ });
+}
+
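A minimal analogue of the scan above (plain C++, hypothetical types): the first function in the module provides the baseline mode, and any later definition whose mode differs marks the attributes as inconsistent; declarations are skipped.

    #include <algorithm>
    #include <cassert>
    #include <vector>
    struct Fn { int Mode; bool IsDecl; };
    static bool inconsistent(const std::vector<Fn> &Fns) {
      if (Fns.empty())
        return false;
      int Base = Fns.front().Mode;
      return std::any_of(Fns.begin() + 1, Fns.end(), [&](const Fn &F) {
        return !F.IsDecl && F.Mode != Base;
      });
    }
    int main() {
      assert(!inconsistent({{0, false}, {0, false}}));
      assert(inconsistent({{0, false}, {1, false}}));
      assert(!inconsistent({{0, false}, {1, true}})); // declarations skipped
    }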
void ARMAsmPrinter::emitAttributes() {
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
@@ -698,7 +711,9 @@ void ARMAsmPrinter::emitAttributes() {
DenormalMode::getPositiveZero()))
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
ARMBuildAttrs::PositiveZero);
- else if (!TM.Options.UnsafeFPMath)
+ else if (checkDenormalAttributeInconsistency(*MMI->getModule()) ||
+ checkDenormalAttributeConsistency(
+ *MMI->getModule(), "denormal-fp-math", DenormalMode::getIEEE()))
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
ARMBuildAttrs::IEEEDenormals);
else {
@@ -733,7 +748,7 @@ void ARMAsmPrinter::emitAttributes() {
TM.Options.NoTrappingFPMath)
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
ARMBuildAttrs::Not_Allowed);
- else if (!TM.Options.UnsafeFPMath) {
+ else {
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Allowed);
// If the user has permitted this code to choose the IEEE 754
diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp
index fafdad0..3b1d3bd 100644
--- a/llvm/lib/Target/Hexagon/RDFCopy.cpp
+++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp
@@ -108,7 +108,7 @@ bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
if (DFG.IsCode<NodeAttrs::Stmt>(IA)) {
NodeAddr<StmtNode*> SA = IA;
- EqualityMap EM(std::less<RegisterRef>(DFG.getPRI()));
+ EqualityMap EM(RegisterRefLess(DFG.getPRI()));
if (interpretAsCopy(SA.Addr->getCode(), EM))
recordCopy(SA, EM);
}
diff --git a/llvm/lib/Target/Hexagon/RDFCopy.h b/llvm/lib/Target/Hexagon/RDFCopy.h
index e4fb898..92b2c65 100644
--- a/llvm/lib/Target/Hexagon/RDFCopy.h
+++ b/llvm/lib/Target/Hexagon/RDFCopy.h
@@ -25,8 +25,8 @@ class MachineInstr;
namespace rdf {
struct CopyPropagation {
- CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
- RDefMap(std::less<RegisterRef>(DFG.getPRI())) {}
+ CopyPropagation(DataFlowGraph &dfg)
+ : MDT(dfg.getDT()), DFG(dfg), RDefMap(RegisterRefLess(DFG.getPRI())) {}
virtual ~CopyPropagation() = default;
@@ -35,7 +35,7 @@ namespace rdf {
bool trace() const { return Trace; }
DataFlowGraph &getDFG() { return DFG; }
- using EqualityMap = std::map<RegisterRef, RegisterRef>;
+ using EqualityMap = std::map<RegisterRef, RegisterRef, RegisterRefLess>;
virtual bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM);
private:
@@ -45,7 +45,7 @@ namespace rdf {
bool Trace = false;
// map: register -> (map: stmt -> reaching def)
- std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap;
+ std::map<RegisterRef, std::map<NodeId, NodeId>, RegisterRefLess> RDefMap;
// map: statement -> (map: dst reg -> src reg)
std::map<NodeId, EqualityMap> CopyMap;
std::vector<NodeId> Copies;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 94f53d5..098bcfa 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -340,6 +340,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
{ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
Expand);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::ABDS, VT, Legal);
setOperationAction(ISD::ABDU, VT, Legal);
setOperationAction(ISD::SADDSAT, VT, Legal);
@@ -419,6 +420,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
{ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
Expand);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::ABDS, VT, Legal);
setOperationAction(ISD::ABDU, VT, Legal);
setOperationAction(ISD::SADDSAT, VT, Legal);
@@ -9557,3 +9559,20 @@ bool LoongArchTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
EVT ScalarVT = VecVT.getScalarType();
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
+
+bool LoongArchTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const {
+ if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+ return false;
+
+  // Extracting a 128-bit subvector from index 0 of a 256-bit vector is free.
+ return Index == 0;
+}
+
+bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT,
+ unsigned Index) const {
+ EVT EltVT = VT.getScalarType();
+
+  // Extracting a scalar FP value from index 0 of a vector is free.
+ return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 3c00296..9b60a9f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -338,6 +338,9 @@ public:
unsigned Depth) const override;
bool shouldScalarizeBinop(SDValue VecOp) const override;
+ bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const override;
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;
/// Check if a constant splat can be generated using [x]vldi, where imm[12]
/// is 1.
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index adfe990..bbc0489 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -2015,10 +2015,26 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)),
(XVFTINTRZ_LU_D v4f64:$vj)),
sub_128)>;
+// abs
+def : Pat<(abs v32i8:$xj), (XVMAX_B v32i8:$xj, (XVNEG_B v32i8:$xj))>;
+def : Pat<(abs v16i16:$xj), (XVMAX_H v16i16:$xj, (XVNEG_H v16i16:$xj))>;
+def : Pat<(abs v8i32:$xj), (XVMAX_W v8i32:$xj, (XVNEG_W v8i32:$xj))>;
+def : Pat<(abs v4i64:$xj), (XVMAX_D v4i64:$xj, (XVNEG_D v4i64:$xj))>;
+
// XVABSD_{B/H/W/D}[U]
defm : PatXrXr<abds, "XVABSD">;
defm : PatXrXrU<abdu, "XVABSD">;
+// XVADDA_{B/H/W/D}
+def : Pat<(add (v32i8 (abs v32i8:$xj)), (v32i8 (abs v32i8:$xk))),
+ (XVADDA_B v32i8:$xj, v32i8:$xk)>;
+def : Pat<(add (v16i16 (abs v16i16:$xj)), (v16i16 (abs v16i16:$xk))),
+ (XVADDA_H v16i16:$xj, v16i16:$xk)>;
+def : Pat<(add (v8i32 (abs v8i32:$xj)), (v8i32 (abs v8i32:$xk))),
+ (XVADDA_W v8i32:$xj, v8i32:$xk)>;
+def : Pat<(add (v4i64 (abs v4i64:$xj)), (v4i64 (abs v4i64:$xk))),
+ (XVADDA_D v4i64:$xj, v4i64:$xk)>;
+
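The abs patterns above use the identity abs(x) == smax(x, -x) in two's complement, including the INT_MIN lane, where both sides wrap to INT_MIN and so match ISD::ABS semantics; the XVADDA patterns then fuse add(abs(a), abs(b)) into one instruction. A quick scalar sanity check (plain C++):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    int main() {
      for (int i = -128; i <= 127; ++i) {
        int8_t X = static_cast<int8_t>(i);
        int8_t Neg = static_cast<int8_t>(-i);  // negate, wraps modulo 2^8
        int8_t Max = std::max(X, Neg);         // the XVMAX/VMAX expansion
        int8_t Abs = X < 0 ? Neg : X;          // ISD::ABS semantics
        assert(Max == Abs);
      }
    }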
// XVSADD_{B/H/W/D}[U], XVSSUB_{B/H/W/D}[U]
defm : PatXrXr<saddsat, "XVSADD">;
defm : PatXrXr<ssubsat, "XVSSUB">;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 2c36099..8d1dc99 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -2154,10 +2154,26 @@ def : Pat<(f32 f32imm_vldi:$in),
def : Pat<(f64 f64imm_vldi:$in),
(f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>;
+// abs
+def : Pat<(abs v16i8:$vj), (VMAX_B v16i8:$vj, (VNEG_B v16i8:$vj))>;
+def : Pat<(abs v8i16:$vj), (VMAX_H v8i16:$vj, (VNEG_H v8i16:$vj))>;
+def : Pat<(abs v4i32:$vj), (VMAX_W v4i32:$vj, (VNEG_W v4i32:$vj))>;
+def : Pat<(abs v2i64:$vj), (VMAX_D v2i64:$vj, (VNEG_D v2i64:$vj))>;
+
// VABSD_{B/H/W/D}[U]
defm : PatVrVr<abds, "VABSD">;
defm : PatVrVrU<abdu, "VABSD">;
+// VADDA_{B/H/W/D}
+def : Pat<(add (v16i8 (abs v16i8:$vj)), (v16i8 (abs v16i8:$vk))),
+ (VADDA_B v16i8:$vj, v16i8:$vk)>;
+def : Pat<(add (v8i16 (abs v8i16:$vj)), (v8i16 (abs v8i16:$vk))),
+ (VADDA_H v8i16:$vj, v8i16:$vk)>;
+def : Pat<(add (v4i32 (abs v4i32:$vj)), (v4i32 (abs v4i32:$vk))),
+ (VADDA_W v4i32:$vj, v4i32:$vk)>;
+def : Pat<(add (v2i64 (abs v2i64:$vj)), (v2i64 (abs v2i64:$vk))),
+ (VADDA_D v2i64:$vj, v2i64:$vk)>;
+
// VSADD_{B/H/W/D}[U], VSSUB_{B/H/W/D}[U]
defm : PatVrVr<saddsat, "VSADD">;
defm : PatVrVr<ssubsat, "VSSUB">;
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index d0a8aba..c5e26c1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -57,6 +57,11 @@ static cl::opt<bool>
cl::desc("Enable the loop data prefetch pass"),
cl::init(false));
+static cl::opt<bool>
+ EnableMergeBaseOffset("loongarch-enable-merge-offset",
+ cl::desc("Enable the merge base offset pass"),
+ cl::init(true), cl::Hidden);
+
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
std::optional<Reloc::Model> RM) {
return RM.value_or(Reloc::Static);
@@ -214,7 +219,7 @@ void LoongArchPassConfig::addMachineSSAOptimization() {
void LoongArchPassConfig::addPreRegAlloc() {
addPass(createLoongArchPreRAExpandPseudoPass());
- if (TM->getOptLevel() != CodeGenOptLevel::None)
+ if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMergeBaseOffset)
addPass(createLoongArchMergeBaseOffsetOptPass());
}
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index cb57c43..d4d9e54 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -193,7 +193,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
// we need to invert the branch condition to jump over TrueBB when the
// condition is false.
auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
- CC = RISCVCC::getOppositeBranchCondition(CC);
+ CC = RISCVCC::getInverseBranchCondition(CC);
// Insert branch instruction.
BuildMI(MBB, MBBI, DL, TII->get(RISCVCC::getBrCond(CC)))
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 56db09a..6d418fd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1134,7 +1134,7 @@ unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC, unsigned SelectOpc) {
}
}
-RISCVCC::CondCode RISCVCC::getOppositeBranchCondition(RISCVCC::CondCode CC) {
+RISCVCC::CondCode RISCVCC::getInverseBranchCondition(RISCVCC::CondCode CC) {
switch (CC) {
default:
llvm_unreachable("Unrecognized conditional branch");
@@ -1554,7 +1554,7 @@ bool RISCVInstrInfo::optimizeCondBranch(MachineInstr &MI) const {
return Register();
};
- unsigned NewOpc = RISCVCC::getBrCond(getOppositeBranchCondition(CC));
+ unsigned NewOpc = RISCVCC::getBrCond(getInverseBranchCondition(CC));
// Might be case 1.
// Don't change 0 to 1 since we can use x0.
@@ -1801,7 +1801,7 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI,
// Add condition code, inverting if necessary.
auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
if (Invert)
- CC = RISCVCC::getOppositeBranchCondition(CC);
+ CC = RISCVCC::getInverseBranchCondition(CC);
NewMI.addImm(CC);
// Copy the false register.
@@ -3978,7 +3978,7 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
case RISCV::PseudoCCMOVGPR: {
// CCMOV can be commuted by inverting the condition.
auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
- CC = RISCVCC::getOppositeBranchCondition(CC);
+ CC = RISCVCC::getInverseBranchCondition(CC);
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(CC);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI*/ false,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 2bc499b..42a0c4c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -44,7 +44,7 @@ enum CondCode {
COND_INVALID
};
-CondCode getOppositeBranchCondition(CondCode);
+CondCode getInverseBranchCondition(CondCode);
unsigned getBrCond(CondCode CC, unsigned SelectOpc = 0);
} // end of namespace RISCVCC
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 563f3bb..d4124ae 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -167,6 +167,42 @@ static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
return false;
}
+// If this i64 AND is part of ((X & -(1 << C1) & 0xffffffff) == (C2 << C1)),
+// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
+// the type will be split so only the lower 32 bits need to be compared using
+// (srai/srli X, C) == C2.
+static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
+ if (!Inst->hasOneUse())
+ return false;
+
+ // Look for equality comparison.
+ auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
+ if (!Cmp || !Cmp->isEquality())
+ return false;
+
+ // Right hand side of comparison should be a constant.
+ auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
+ if (!C)
+ return false;
+
+ uint64_t Mask = Imm.getZExtValue();
+
+ // Mask should be of the form -(1 << C) in the lower 32 bits.
+ if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
+ return false;
+
+ // Comparison constant should be a subset of Mask.
+ uint64_t CmpC = C->getZExtValue();
+ if ((CmpC & Mask) != CmpC)
+ return false;
+
+ // We'll need to sign extend the comparison constant and shift it right. Make
+ // sure the new constant can use addi/xori+seqz/snez.
+ unsigned ShiftBits = llvm::countr_zero(Mask);
+ int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
+ return NewCmpC >= -2048 && NewCmpC <= 2048;
+}
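A minimal standalone restatement of the screening above (plain C++20; the helper name `qualifies` is hypothetical):

    #include <bit>
    #include <cassert>
    #include <cstdint>
    static bool qualifies(uint64_t Mask, uint64_t CmpC) {
      if (Mask > 0xffffffffULL)                    // must fit in low 32 bits
        return false;
      uint32_t Neg = -static_cast<uint32_t>(Mask); // Mask == -(1 << C)?
      if (!std::has_single_bit(Neg))
        return false;
      if ((CmpC & Mask) != CmpC)                   // constant subset of mask
        return false;
      int64_t New = static_cast<int64_t>(static_cast<int32_t>(CmpC)) >>
                    std::countr_zero(Mask);        // sign extend, shift right
      return New >= -2048 && New <= 2048;          // addi/xori reach
    }
    int main() {
      assert(qualifies(0xffffff00, 0x4500)); // -> (srai X, 8) == 0x45
      assert(!qualifies(0xffffff00, 0x45));  // 0x45 escapes the mask
    }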
+
InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind,
@@ -224,6 +260,9 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
canUseShiftPair(Inst, Imm))
return TTI::TCC_Free;
+ if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
+ canUseShiftCmp(Inst, Imm))
+ return TTI::TCC_Free;
Takes12BitImm = true;
break;
case Instruction::Add:
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 86f4459..f704d3a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1096,6 +1096,41 @@ static bool build2DBlockIOINTELInst(const SPIRV::IncomingCall *Call,
return true;
}
+static bool buildPipeInst(const SPIRV::IncomingCall *Call, unsigned Opcode,
+ unsigned Scope, MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ switch (Opcode) {
+ case SPIRV::OpCommitReadPipe:
+ case SPIRV::OpCommitWritePipe:
+ return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0));
+ case SPIRV::OpGroupCommitReadPipe:
+ case SPIRV::OpGroupCommitWritePipe:
+ case SPIRV::OpGroupReserveReadPipePackets:
+ case SPIRV::OpGroupReserveWritePipePackets: {
+ Register ScopeConstReg =
+ MIRBuilder.buildConstant(LLT::scalar(32), Scope).getReg(0);
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+ MRI->setRegClass(ScopeConstReg, &SPIRV::iIDRegClass);
+ MachineInstrBuilder MIB;
+ MIB = MIRBuilder.buildInstr(Opcode);
+ // Add Return register and type.
+ if (Opcode == SPIRV::OpGroupReserveReadPipePackets ||
+ Opcode == SPIRV::OpGroupReserveWritePipePackets)
+ MIB.addDef(Call->ReturnRegister)
+ .addUse(GR->getSPIRVTypeID(Call->ReturnType));
+
+ MIB.addUse(ScopeConstReg);
+ for (unsigned int i = 0; i < Call->Arguments.size(); ++i)
+ MIB.addUse(Call->Arguments[i]);
+
+ return true;
+ }
+ default:
+ return buildOpFromWrapper(MIRBuilder, Opcode, Call,
+ GR->getSPIRVTypeID(Call->ReturnType));
+ }
+}
+
static unsigned getNumComponentsForDim(SPIRV::Dim::Dim dim) {
switch (dim) {
case SPIRV::Dim::DIM_1D:
@@ -2350,6 +2385,20 @@ static bool generate2DBlockIOINTELInst(const SPIRV::IncomingCall *Call,
return build2DBlockIOINTELInst(Call, Opcode, MIRBuilder, GR);
}
+static bool generatePipeInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+ unsigned Opcode =
+ SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode;
+
+ unsigned Scope = SPIRV::Scope::Workgroup;
+ if (Builtin->Name.contains("sub_group"))
+ Scope = SPIRV::Scope::Subgroup;
+
+ return buildPipeInst(Call, Opcode, Scope, MIRBuilder, GR);
+}
+
static bool buildNDRange(const SPIRV::IncomingCall *Call,
MachineIRBuilder &MIRBuilder,
SPIRVGlobalRegistry *GR) {
@@ -2948,6 +2997,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generateTernaryBitwiseFunctionINTELInst(Call.get(), MIRBuilder, GR);
case SPIRV::Block2DLoadStore:
return generate2DBlockIOINTELInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::Pipe:
+ return generatePipeInst(Call.get(), MIRBuilder, GR);
}
return false;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index d08560b..2a8deb6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -69,6 +69,7 @@ def ExtendedBitOps : BuiltinGroup;
def BindlessINTEL : BuiltinGroup;
def TernaryBitwiseINTEL : BuiltinGroup;
def Block2DLoadStore : BuiltinGroup;
+def Pipe : BuiltinGroup;
//===----------------------------------------------------------------------===//
// Class defining a demangled builtin record. The information in the record
@@ -633,6 +634,29 @@ defm : DemangledNativeBuiltin<"__spirv_AtomicSMax", OpenCL_std, Atomic, 4, 4, Op
defm : DemangledNativeBuiltin<"__spirv_AtomicUMin", OpenCL_std, Atomic, 4, 4, OpAtomicUMin>;
defm : DemangledNativeBuiltin<"__spirv_AtomicUMax", OpenCL_std, Atomic, 4, 4, OpAtomicUMax>;
+// Pipe builtin records:
+defm : DemangledNativeBuiltin<"__read_pipe_2", OpenCL_std, Pipe, 2, 2, OpReadPipe>;
+defm : DemangledNativeBuiltin<"__write_pipe_2", OpenCL_std, Pipe, 2, 2, OpWritePipe>;
+defm : DemangledNativeBuiltin<"__read_pipe_4", OpenCL_std, Pipe, 4, 4, OpReservedReadPipe>;
+defm : DemangledNativeBuiltin<"__write_pipe_4", OpenCL_std, Pipe, 4, 4, OpReservedWritePipe>;
+defm : DemangledNativeBuiltin<"__reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpReserveReadPipePackets>;
+defm : DemangledNativeBuiltin<"__reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpReserveWritePipePackets>;
+defm : DemangledNativeBuiltin<"__commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpCommitReadPipe>;
+defm : DemangledNativeBuiltin<"__commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpCommitWritePipe>;
+defm : DemangledNativeBuiltin<"is_valid_reserve_id", OpenCL_std, Pipe, 1, 1, OpIsValidReserveId>;
+defm : DemangledNativeBuiltin<"__get_pipe_num_packets_ro", OpenCL_std, Pipe, 1, 1, OpGetNumPipePackets>;
+defm : DemangledNativeBuiltin<"__get_pipe_max_packets_ro", OpenCL_std, Pipe, 1, 1, OpGetMaxPipePackets>;
+defm : DemangledNativeBuiltin<"__get_pipe_num_packets_wo", OpenCL_std, Pipe, 1, 1, OpGetNumPipePackets>;
+defm : DemangledNativeBuiltin<"__get_pipe_max_packets_wo", OpenCL_std, Pipe, 1, 1, OpGetMaxPipePackets>;
+defm : DemangledNativeBuiltin<"__work_group_reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveReadPipePackets>;
+defm : DemangledNativeBuiltin<"__work_group_reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveWritePipePackets>;
+defm : DemangledNativeBuiltin<"__work_group_commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitReadPipe>;
+defm : DemangledNativeBuiltin<"__work_group_commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitWritePipe>;
+defm : DemangledNativeBuiltin<"__sub_group_reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveReadPipePackets>;
+defm : DemangledNativeBuiltin<"__sub_group_reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveWritePipePackets>;
+defm : DemangledNativeBuiltin<"__sub_group_commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitReadPipe>;
+defm : DemangledNativeBuiltin<"__sub_group_commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitWritePipe>;
+
// Barrier builtin records:
defm : DemangledNativeBuiltin<"barrier", OpenCL_std, Barrier, 1, 3, OpControlBarrier>;
defm : DemangledNativeBuiltin<"work_group_barrier", OpenCL_std, Barrier, 1, 3, OpControlBarrier>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 993de9e..85ea9e1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -148,7 +148,10 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>>
SPIRV::Extension::Extension::SPV_KHR_float_controls2},
{"SPV_INTEL_tensor_float32_conversion",
SPIRV::Extension::Extension::SPV_INTEL_tensor_float32_conversion},
- {"SPV_KHR_bfloat16", SPIRV::Extension::Extension::SPV_KHR_bfloat16}};
+ {"SPV_KHR_bfloat16", SPIRV::Extension::Extension::SPV_KHR_bfloat16},
+ {"SPV_EXT_relaxed_printf_string_address_space",
+ SPIRV::Extension::Extension::
+ SPV_EXT_relaxed_printf_string_address_space}};
bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
StringRef ArgValue,
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index f5a49e2..704edd3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -1909,11 +1909,12 @@ Instruction *SPIRVEmitIntrinsics::visitInsertValueInst(InsertValueInst &I) {
B.SetInsertPoint(&I);
SmallVector<Type *, 1> Types = {I.getInsertedValueOperand()->getType()};
SmallVector<Value *> Args;
- for (auto &Op : I.operands())
- if (isa<UndefValue>(Op))
- Args.push_back(UndefValue::get(B.getInt32Ty()));
- else
- Args.push_back(Op);
+ Value *AggregateOp = I.getAggregateOperand();
+ if (isa<UndefValue>(AggregateOp))
+ Args.push_back(UndefValue::get(B.getInt32Ty()));
+ else
+ Args.push_back(AggregateOp);
+ Args.push_back(I.getInsertedValueOperand());
for (auto &Op : I.indices())
Args.push_back(B.getInt32(Op));
Instruction *NewI =
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 496dcba..1723bfb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -763,7 +763,38 @@ def OpGetDefaultQueue: Op<303, (outs ID:$res), (ins TYPE:$type),
def OpBuildNDRange: Op<304, (outs ID:$res), (ins TYPE:$type, ID:$GWS, ID:$LWS, ID:$GWO),
"$res = OpBuildNDRange $type $GWS $LWS $GWO">;
-// TODO: 3.42.23. Pipe Instructions
+// 3.42.23. Pipe Instructions
+
+def OpReadPipe: Op<274, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign),
+ "$res = OpReadPipe $type $Pipe $Pointer $PcktSize $PcktAlign">;
+def OpWritePipe: Op<275, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign),
+ "$res = OpWritePipe $type $Pipe $Pointer $PcktSize $PcktAlign">;
+def OpReservedReadPipe : Op<276, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$ReserveId, ID:$Index, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign),
+ "$res = OpReservedReadPipe $type $Pipe $ReserveId $Index $Pointer $PcktSize $PcktAlign">;
+def OpReservedWritePipe : Op<277, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$ReserveId, ID:$Index, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign),
+ "$res = OpReservedWritePipe $type $Pipe $ReserveId $Index $Pointer $PcktSize $PcktAlign">;
+def OpReserveReadPipePackets : Op<278, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$NumPckts, ID:$PcktSize, ID:$PcktAlign),
+ "$res = OpReserveReadPipePackets $type $Pipe $NumPckts $PcktSize $PcktAlign">;
+def OpReserveWritePipePackets : Op<279, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$NumPckts, ID:$PcktSize, ID:$PcktAlign),
+ "$res = OpReserveWritePipePackets $type $Pipe $NumPckts $PcktSize $PcktAlign">;
+def OpCommitReadPipe : Op<280, (outs), (ins ID:$Pipe, ID:$ReserveId, ID:$PcktSize, ID:$PcktAlign),
+ "OpCommitReadPipe $Pipe $ReserveId $PcktSize $PcktAlign">;
+def OpCommitWritePipe : Op<281, (outs), (ins ID:$Pipe, ID:$ReserveId, ID:$PcktSize, ID:$PcktAlign),
+ "OpCommitWritePipe $Pipe $ReserveId $PcktSize $PcktAlign">;
+def OpIsValidReserveId : Op<282, (outs ID:$res), (ins TYPE:$type, ID:$ReserveId),
+ "$res = OpIsValidReserveId $type $ReserveId">;
+def OpGetNumPipePackets : Op<283, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$PacketSize, ID:$PacketAlign),
+ "$res = OpGetNumPipePackets $type $Pipe $PacketSize $PacketAlign">;
+def OpGetMaxPipePackets : Op<284, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$PacketSize, ID:$PacketAlign),
+ "$res = OpGetMaxPipePackets $type $Pipe $PacketSize $PacketAlign">;
+def OpGroupReserveReadPipePackets : Op<285, (outs ID:$res), (ins TYPE:$type, ID:$Scope, ID:$Pipe, ID:$NumPckts, ID:$PacketSize, ID:$PacketAlign),
+ "$res = OpGroupReserveReadPipePackets $type $Scope $Pipe $NumPckts $PacketSize $PacketAlign">;
+def OpGroupReserveWritePipePackets : Op<286, (outs ID:$res), (ins TYPE:$type, ID:$Scope, ID:$Pipe, ID:$NumPckts, ID:$PacketSize, ID:$PacketAlign),
+ "$res = OpGroupReserveWritePipePackets $type $Scope $Pipe $NumPckts $PacketSize $PacketAlign">;
+def OpGroupCommitReadPipe : Op<287, (outs), (ins ID:$Scope, ID:$Pipe, ID:$ReserveId, ID:$PacketSize, ID:$PacketAlign),
+ "OpGroupCommitReadPipe $Scope $Pipe $ReserveId $PacketSize $PacketAlign">;
+def OpGroupCommitWritePipe : Op<288, (outs), (ins ID:$Scope, ID:$Pipe, ID:$ReserveId, ID:$PacketSize, ID:$PacketAlign),
+ "OpGroupCommitWritePipe $Scope $Pipe $ReserveId $PacketSize $PacketAlign">;
// 3.42.24. Non-Uniform Instructions
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index a7b2179..1aadd9d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -197,6 +197,8 @@ private:
bool selectOverflowArith(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I, unsigned Opcode) const;
+ bool selectDebugTrap(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I, bool Signed) const;
@@ -312,7 +314,8 @@ private:
MachineInstr &I) const;
bool selectModf(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
-
+ bool selectFrexp(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
// Utilities
std::pair<Register, bool>
buildI32Constant(uint32_t Val, MachineInstr &I,
@@ -833,6 +836,9 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
case TargetOpcode::G_USUBSAT:
return selectExtInst(ResVReg, ResType, I, CL::u_sub_sat);
+ case TargetOpcode::G_FFREXP:
+ return selectFrexp(ResVReg, ResType, I);
+
case TargetOpcode::G_UADDO:
return selectOverflowArith(ResVReg, ResType, I,
ResType->getOpcode() == SPIRV::OpTypeVector
@@ -999,16 +1005,26 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
// represent code after lowering or intrinsics which are not implemented but
// should not crash when found in a customer's LLVM IR input.
case TargetOpcode::G_TRAP:
- case TargetOpcode::G_DEBUGTRAP:
case TargetOpcode::G_UBSANTRAP:
case TargetOpcode::DBG_LABEL:
return true;
+ case TargetOpcode::G_DEBUGTRAP:
+ return selectDebugTrap(ResVReg, ResType, I);
default:
return false;
}
}
+bool SPIRVInstructionSelector::selectDebugTrap(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ unsigned Opcode = SPIRV::OpNop;
+ MachineBasicBlock &BB = *I.getParent();
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode))
+ .constrainAllUses(TII, TRI, RBI);
+}
+
bool SPIRVInstructionSelector::selectExtInst(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I,
@@ -1107,6 +1123,53 @@ bool SPIRVInstructionSelector::selectExtInstForLRound(
return false;
}
+bool SPIRVInstructionSelector::selectFrexp(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+ ExtInstList ExtInsts = {{SPIRV::InstructionSet::OpenCL_std, CL::frexp},
+ {SPIRV::InstructionSet::GLSL_std_450, GL::Frexp}};
+ for (const auto &Ex : ExtInsts) {
+ SPIRV::InstructionSet::InstructionSet Set = Ex.first;
+ uint32_t Opcode = Ex.second;
+ if (!STI.canUseExtInstSet(Set))
+ continue;
+
+ MachineIRBuilder MIRBuilder(I);
+ SPIRVType *PointeeTy = GR.getSPIRVTypeForVReg(I.getOperand(1).getReg());
+ const SPIRVType *PointerType = GR.getOrCreateSPIRVPointerType(
+ PointeeTy, MIRBuilder, SPIRV::StorageClass::Function);
+ Register PointerVReg =
+ createVirtualRegister(PointerType, &GR, MRI, MRI->getMF());
+
+ auto It = getOpVariableMBBIt(I);
+ auto MIB = BuildMI(*It->getParent(), It, It->getDebugLoc(),
+ TII.get(SPIRV::OpVariable))
+ .addDef(PointerVReg)
+ .addUse(GR.getSPIRVTypeID(PointerType))
+ .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function))
+ .constrainAllUses(TII, TRI, RBI);
+
+ MIB = MIB &
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(Ex.first))
+ .addImm(Opcode)
+ .add(I.getOperand(2))
+ .addUse(PointerVReg)
+ .constrainAllUses(TII, TRI, RBI);
+
+ MIB = MIB &
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad))
+ .addDef(I.getOperand(1).getReg())
+ .addUse(GR.getSPIRVTypeID(PointeeTy))
+ .addUse(PointerVReg)
+ .constrainAllUses(TII, TRI, RBI);
+ return MIB;
+ }
+ return false;
+}
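For reference, the contract being selected here is the usual frexp split: the mantissa in [0.5, 1) is the direct result and the exponent is stored through the pointer operand, which is why the selector materializes an OpVariable and loads it back into G_FFREXP's second result. A plain C++ illustration:

    #include <cassert>
    #include <cmath>
    int main() {
      int Exp = 0;
      double Mant = std::frexp(48.0, &Exp); // 48.0 == 0.75 * 2^6
      assert(Mant == 0.75 && Exp == 6);
    }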
+
bool SPIRVInstructionSelector::selectOpWithSrcs(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I,
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 27bb54c..db85e33 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -290,6 +290,9 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
// Control-flow. In some cases (e.g. constants) s1 may be promoted to s32.
getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s32});
+ getActionDefinitionsBuilder(G_FFREXP).legalForCartesianProduct(
+ allFloatScalarsAndVectors, {s32, v2s32, v3s32, v4s32, v8s32, v16s32});
+
// TODO: Review the target OpenCL and GLSL Extended Instruction Set specs to
// tighten these requirements. Many of these math functions are only legal on
// specific bitwidths, so they are not selectable for
@@ -584,7 +587,8 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass(
}
if (FPClassTest PartialCheck = Mask & fcNan) {
- auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask);
+ auto InfWithQnanBitC =
+ buildSPIRVConstant(IntTy, std::move(Inf) | QNaNBitMask);
if (PartialCheck == fcNan) {
// isnan(V) ==> abs(V) u> int(inf)
appendToRes(
@@ -610,7 +614,7 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass(
APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
auto ExpMinusOne = assignSPIRVTy(
MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB)));
- APInt MaxExpMinusOne = ExpMask - ExpLSB;
+ APInt MaxExpMinusOne = std::move(ExpMask) - ExpLSB;
auto NormalRes = assignSPIRVTy(
MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
buildSPIRVConstant(IntTy, MaxExpMinusOne)));
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index a95f393..bc159d5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1222,6 +1222,31 @@ static void AddDotProductRequirements(const MachineInstr &MI,
}
}
+void addPrintfRequirements(const MachineInstr &MI,
+ SPIRV::RequirementHandler &Reqs,
+ const SPIRVSubtarget &ST) {
+ SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry();
+ const SPIRVType *PtrType = GR->getSPIRVTypeForVReg(MI.getOperand(4).getReg());
+ if (PtrType) {
+ MachineOperand ASOp = PtrType->getOperand(1);
+ if (ASOp.isImm()) {
+ unsigned AddrSpace = ASOp.getImm();
+ if (AddrSpace != SPIRV::StorageClass::UniformConstant) {
+ if (!ST.canUseExtension(
+ SPIRV::Extension::
+ SPV_EXT_relaxed_printf_string_address_space)) {
+ report_fatal_error("SPV_EXT_relaxed_printf_string_address_space is "
+ "required because printf uses a format string not "
+ "in constant address space.",
+ false);
+ }
+ Reqs.addExtension(
+ SPIRV::Extension::SPV_EXT_relaxed_printf_string_address_space);
+ }
+ }
+ }
+}
+
static bool isBFloat16Type(const SPIRVType *TypeDef) {
return TypeDef && TypeDef->getNumOperands() == 3 &&
TypeDef->getOpcode() == SPIRV::OpTypeFloat &&
@@ -1230,8 +1255,9 @@ static bool isBFloat16Type(const SPIRVType *TypeDef) {
}
void addInstrRequirements(const MachineInstr &MI,
- SPIRV::RequirementHandler &Reqs,
+ SPIRV::ModuleAnalysisInfo &MAI,
const SPIRVSubtarget &ST) {
+ SPIRV::RequirementHandler &Reqs = MAI.Reqs;
switch (MI.getOpcode()) {
case SPIRV::OpMemoryModel: {
int64_t Addr = MI.getOperand(0).getImm();
@@ -1321,6 +1347,12 @@ void addInstrRequirements(const MachineInstr &MI,
static_cast<int64_t>(
SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100)) {
Reqs.addExtension(SPIRV::Extension::SPV_KHR_non_semantic_info);
+ break;
+ }
+ if (MI.getOperand(3).getImm() ==
+ static_cast<int64_t>(SPIRV::OpenCLExtInst::printf)) {
+ addPrintfRequirements(MI, Reqs, ST);
+ break;
}
break;
}
@@ -1781,15 +1813,45 @@ void addInstrRequirements(const MachineInstr &MI,
break;
case SPIRV::OpConvertHandleToImageINTEL:
case SPIRV::OpConvertHandleToSamplerINTEL:
- case SPIRV::OpConvertHandleToSampledImageINTEL:
+ case SPIRV::OpConvertHandleToSampledImageINTEL: {
if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bindless_images))
report_fatal_error("OpConvertHandleTo[Image/Sampler/SampledImage]INTEL "
"instructions require the following SPIR-V extension: "
"SPV_INTEL_bindless_images",
false);
+ SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry();
+ SPIRV::AddressingModel::AddressingModel AddrModel = MAI.Addr;
+ SPIRVType *TyDef = GR->getSPIRVTypeForVReg(MI.getOperand(1).getReg());
+ if (MI.getOpcode() == SPIRV::OpConvertHandleToImageINTEL &&
+ TyDef->getOpcode() != SPIRV::OpTypeImage) {
+ report_fatal_error("Incorrect return type for the instruction "
+ "OpConvertHandleToImageINTEL",
+ false);
+ } else if (MI.getOpcode() == SPIRV::OpConvertHandleToSamplerINTEL &&
+ TyDef->getOpcode() != SPIRV::OpTypeSampler) {
+ report_fatal_error("Incorrect return type for the instruction "
+ "OpConvertHandleToSamplerINTEL",
+ false);
+ } else if (MI.getOpcode() == SPIRV::OpConvertHandleToSampledImageINTEL &&
+ TyDef->getOpcode() != SPIRV::OpTypeSampledImage) {
+ report_fatal_error("Incorrect return type for the instruction "
+ "OpConvertHandleToSampledImageINTEL",
+ false);
+ }
+ SPIRVType *SpvTy = GR->getSPIRVTypeForVReg(MI.getOperand(2).getReg());
+ unsigned Bitwidth = GR->getScalarOrVectorBitWidth(SpvTy);
+ if (!(Bitwidth == 32 && AddrModel == SPIRV::AddressingModel::Physical32) &&
+ !(Bitwidth == 64 && AddrModel == SPIRV::AddressingModel::Physical64)) {
+ report_fatal_error(
+ "Parameter value must be a 32-bit scalar in case of "
+ "Physical32 addressing model or a 64-bit scalar in case of "
+ "Physical64 addressing model",
+ false);
+ }
Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bindless_images);
Reqs.addCapability(SPIRV::Capability::BindlessImagesINTEL);
break;
+ }
case SPIRV::OpSubgroup2DBlockLoadINTEL:
case SPIRV::OpSubgroup2DBlockLoadTransposeINTEL:
case SPIRV::OpSubgroup2DBlockLoadTransformINTEL:
@@ -1927,7 +1989,7 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI,
continue;
for (const MachineBasicBlock &MBB : *MF)
for (const MachineInstr &MI : MBB)
- addInstrRequirements(MI, MAI.Reqs, ST);
+ addInstrRequirements(MI, MAI, ST);
}
// Collect requirements for OpExecutionMode instructions.
auto Node = M.getNamedMetadata("spirv.ExecutionMode");
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 2b34f61..4e4e6fb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -335,6 +335,21 @@ static void lowerFunnelShifts(IntrinsicInst *FSHIntrinsic) {
FSHIntrinsic->setCalledFunction(FSHFunc);
}
+static void lowerConstrainedFPCmpIntrinsic(
+ ConstrainedFPCmpIntrinsic *ConstrainedCmpIntrinsic,
+ SmallVector<Instruction *> &EraseFromParent) {
+ if (!ConstrainedCmpIntrinsic)
+ return;
+  // Extract the floating-point values being compared.
+ Value *LHS = ConstrainedCmpIntrinsic->getArgOperand(0);
+ Value *RHS = ConstrainedCmpIntrinsic->getArgOperand(1);
+ FCmpInst::Predicate Pred = ConstrainedCmpIntrinsic->getPredicate();
+ IRBuilder<> Builder(ConstrainedCmpIntrinsic);
+ Value *FCmp = Builder.CreateFCmp(Pred, LHS, RHS);
+ ConstrainedCmpIntrinsic->replaceAllUsesWith(FCmp);
+ EraseFromParent.push_back(dyn_cast<Instruction>(ConstrainedCmpIntrinsic));
+}
+
static void lowerExpectAssume(IntrinsicInst *II) {
// If we cannot use the SPV_KHR_expect_assume extension, then we need to
// ignore the intrinsic and move on. It should be removed later on by LLVM.
@@ -376,6 +391,7 @@ static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) {
bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
bool Changed = false;
const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
+ SmallVector<Instruction *> EraseFromParent;
for (BasicBlock &BB : *F) {
for (Instruction &I : make_early_inc_range(BB)) {
auto Call = dyn_cast<CallInst>(&I);
@@ -423,9 +439,17 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
lowerPtrAnnotation(II);
Changed = true;
break;
+ case Intrinsic::experimental_constrained_fcmp:
+ case Intrinsic::experimental_constrained_fcmps:
+ lowerConstrainedFPCmpIntrinsic(dyn_cast<ConstrainedFPCmpIntrinsic>(II),
+ EraseFromParent);
+ Changed = true;
+ break;
}
}
}
+ for (auto *I : EraseFromParent)
+ I->eraseFromParent();
return Changed;
}
diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 33dc0a2..a1d4e0b 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction(
return true;
};
+ // Is ADD(X,X) more efficient than SHL(X,1)?
+ auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
+ if (MI.getOperand(NumOperands - 1).getImm() != 1)
+ return false;
+ if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
+ return false;
+ LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+ {
+ MI.setDesc(TII->get(AddOpc));
+ MI.removeOperand(NumOperands - 1);
+ MI.addOperand(MI.getOperand(NumOperands - 2));
+ }
+ LLVM_DEBUG(dbgs() << " With: " << MI);
+    return true;
+ };
+
switch (Opc) {
case X86::BLENDPDrri:
return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
@@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction(
return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
case X86::VUNPCKHPSZrmkz:
return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
+
+ case X86::PSLLWri:
+ return ProcessShiftLeftToAdd(X86::PADDWrr);
+ case X86::VPSLLWri:
+ return ProcessShiftLeftToAdd(X86::VPADDWrr);
+ case X86::VPSLLWYri:
+ return ProcessShiftLeftToAdd(X86::VPADDWYrr);
+ case X86::VPSLLWZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
+ case X86::VPSLLWZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
+ case X86::VPSLLWZri:
+ return ProcessShiftLeftToAdd(X86::VPADDWZrr);
+ case X86::PSLLDri:
+ return ProcessShiftLeftToAdd(X86::PADDDrr);
+ case X86::VPSLLDri:
+ return ProcessShiftLeftToAdd(X86::VPADDDrr);
+ case X86::VPSLLDYri:
+ return ProcessShiftLeftToAdd(X86::VPADDDYrr);
+ case X86::VPSLLDZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
+ case X86::VPSLLDZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
+ case X86::VPSLLDZri:
+ return ProcessShiftLeftToAdd(X86::VPADDDZrr);
+ case X86::PSLLQri:
+ return ProcessShiftLeftToAdd(X86::PADDQrr);
+ case X86::VPSLLQri:
+ return ProcessShiftLeftToAdd(X86::VPADDQrr);
+ case X86::VPSLLQYri:
+ return ProcessShiftLeftToAdd(X86::VPADDQYrr);
+ case X86::VPSLLQZ128ri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
+ case X86::VPSLLQZ256ri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
+ case X86::VPSLLQZri:
+ return ProcessShiftLeftToAdd(X86::VPADDQZrr);
+
default:
return false;
}
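ProcessShiftLeftToAdd above relies on the per-lane identity x << 1 == x + x, patching the operand list from (dst, src, imm) to (dst, src, src). A trivial scalar check (plain C++):

    #include <cassert>
    #include <cstdint>
    int main() {
      for (uint16_t X : {uint16_t(0), uint16_t(1), uint16_t(0x7fff),
                         uint16_t(0xffff)})
        assert(uint16_t(X << 1) == uint16_t(X + X));
    }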
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index efeddd7..cd04ff5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4456,8 +4456,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
bool AllowAVX512 = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
- if ((CheckBWI && Subtarget.useBWIRegs()) ||
- (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) {
+ if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
+ (!CheckBWI && Subtarget.useAVX512Regs()))) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
- if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
- // Hardware support for vector shifts is sparse which makes us scalarize the
- // vector operations in many cases. Also, on sandybridge ADD is faster than
- // shl: (shl V, 1) -> (add (freeze V), (freeze V))
- if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
- // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
- // must be 0). (add undef, undef) however can be any value. To make this
- // safe, we must freeze R to ensure that register allocation uses the same
- // register for an undefined value. This ensures that the result will
- // still be even and preserves the original semantics.
- R = DAG.getFreeze(R);
- return DAG.getNode(ISD::ADD, dl, VT, R, R);
- }
-
+ if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
- }
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
@@ -31229,16 +31215,16 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumElts = VT.getVectorNumElements();
if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
- if (IsFSHR)
- std::swap(Op0, Op1);
if (IsCstSplat) {
+ if (IsFSHR)
+ std::swap(Op0, Op1);
uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
{Op0, Op1, Imm}, DAG, Subtarget);
}
- return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
+ return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
{Op0, Op1, Amt}, DAG, Subtarget);
}
assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
@@ -35153,8 +35139,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VALIGN)
NODE_NAME_CASE(VSHLD)
NODE_NAME_CASE(VSHRD)
- NODE_NAME_CASE(VSHLDV)
- NODE_NAME_CASE(VSHRDV)
NODE_NAME_CASE(PSHUFD)
NODE_NAME_CASE(PSHUFHW)
NODE_NAME_CASE(PSHUFLW)
@@ -45185,6 +45169,7 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
case X86ISD::Wrapper:
case X86ISD::WrapperRIP:
return true;
+ case X86ISD::INSERTPS:
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
@@ -45255,6 +45240,7 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
case X86ISD::BLENDV:
return false;
// SSE target shuffles.
+ case X86ISD::INSERTPS:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
@@ -46211,7 +46197,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
SDValue Zero = DAG.getConstant(0, DL, DpVT);
return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
- DpBuilder, false);
+ DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
}
// Create a PSADBW given two sources representable as zexts of vXi8.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 8ab8c66..b55556a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -471,8 +471,7 @@ namespace llvm {
// VBMI2 Concat & Shift.
VSHLD,
VSHRD,
- VSHLDV,
- VSHRDV,
+
// Shuffle Packed Values at 128-bit granularity.
SHUF128,
MOVDDUP,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index b8f2999..564810c 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3238,6 +3238,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
(_.VT _.RC:$src1),
(_.VT _.RC:$src0))))], _.ExeDomain>,
EVEX, EVEX_K, Sched<[Sched.RR]>;
+ let mayLoad = 1, canFoldAsLoad = 1 in
def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -3248,6 +3249,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
(_.VT _.RC:$src0))))], _.ExeDomain>,
EVEX, EVEX_K, Sched<[Sched.RM]>;
}
+ let mayLoad = 1, canFoldAsLoad = 1 in
def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src),
OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
@@ -12298,72 +12300,76 @@ defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
// VBMI2
//===----------------------------------------------------------------------===//
-multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
let Constraints = "$src1 = $dst",
ExeDomain = VTI.ExeDomain in {
defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
- (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
+ !if(SwapLR,
+ (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src3))),
+ (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src3))))>,
T8, PD, EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
- (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (VTI.LdFrag addr:$src3))))>,
+ !if(SwapLR,
+ (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src3)))),
+ (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.LdFrag addr:$src3)))))>,
T8, PD, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
-multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
X86FoldableSchedWrite sched, X86VectorVTInfo VTI>
- : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> {
+ : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched, VTI> {
let Constraints = "$src1 = $dst",
ExeDomain = VTI.ExeDomain in
defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
"${src3}"#VTI.BroadcastStr#", $src2",
"$src2, ${src3}"#VTI.BroadcastStr,
- (OpNode VTI.RC:$src1, VTI.RC:$src2,
- (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
+ !if(SwapLR,
+ (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))),
+ (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))))>,
T8, PD, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasVBMI2] in
- defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
EVEX_V512;
let Predicates = [HasVBMI2, HasVLX] in {
- defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
EVEX_V256;
- defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
EVEX_V128;
}
}
-multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
let Predicates = [HasVBMI2] in
- defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+ defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
EVEX_V512;
let Predicates = [HasVBMI2, HasVLX] in {
- defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+ defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
EVEX_V256;
- defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+ defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
EVEX_V128;
}
}
multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
- SDNode OpNode, X86SchedWriteWidths sched> {
- defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched,
+ SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched> {
+ defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, SwapLR, sched,
avx512vl_i16_info>, REX_W, EVEX_CD8<16, CD8VF>;
- defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched,
+ defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, SwapLR, sched,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
- defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched,
+ defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, SwapLR, sched,
avx512vl_i64_info>, REX_W, EVEX_CD8<64, CD8VF>;
}
@@ -12379,8 +12385,8 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
}
// Concat & Shift
-defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>;
-defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>;
+defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", fshl, 0, SchedWriteVecIMul>;
+defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", fshr, 1, SchedWriteVecIMul>;
defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>;
defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>;
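With the custom VSHLDV/VSHRDV nodes removed, the variable-shift forms now select from generic funnel shifts; the new SwapLR bit swaps $src1 and $src2 for VPSHRDV, whose operand order is reversed relative to ISD::FSHR. Funnel-shift semantics for reference (plain C++):

    #include <cassert>
    #include <cstdint>
    // fshl concatenates A:B (A in the high half), shifts left by C, and
    // keeps the high word; fshr is the mirrored right-shift form.
    static uint16_t fshl16(uint16_t A, uint16_t B, unsigned C) {
      C %= 16;
      return C ? uint16_t((A << C) | (B >> (16 - C))) : A;
    }
    int main() {
      assert(fshl16(0x00FF, 0xFF00, 4) == 0x0FFF);
    }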
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index b4768590..031fdc1 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -25,18 +25,12 @@ let SchedRW = [WriteLEA] in {
[(set GR32:$dst, lea32addr:$src)]>,
OpSize32, Requires<[Not64BitMode]>;
- let Predicates = [HasNDD], isCodeGenOnly = 1 in {
- def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR8:$dst), (ins lea64_8mem:$src),
- "lea{b}\t{$src|$dst}, {$dst|$src}",
- [(set GR8:$dst, lea64_iaddr:$src)]>,
- OpSize16,
- Requires<[In64BitMode]>;
-
- def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR16:$dst), (ins lea64_16mem:$src),
- "lea{w}\t{$src|$dst}, {$dst|$src}",
- [(set GR16:$dst, lea64_iaddr:$src)]>,
- OpSize16,
- Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in {
+ def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_8mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32;
+
+ def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_16mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32;
}
def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src),
@@ -51,6 +45,11 @@ let SchedRW = [WriteLEA] in {
[(set GR64:$dst, lea64addr:$src)]>;
} // SchedRW
+let Predicates = [HasNDD] in {
+ def : Pat<(i8 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_8r lea64_8mem:$src), sub_8bit)>;
+ def : Pat<(i16 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_16r lea64_16mem:$src), sub_16bit)>;
+}
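The replacement patterns are sound because truncation commutes with wrap-around address arithmetic: computing base + index*scale + disp at 32 bits and then extracting the low 8 or 16 bits (the sub_8bit/sub_16bit subregister) gives the same result as the narrow computation. A quick standalone check (plain C++, illustrative constants):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Base = 0x12345678, Index = 0xDEADBEEF, Disp = 0x100;
  // 32-bit LEA followed by a sub_8bit/sub_16bit extract...
  uint32_t Wide = Base + Index * 4 + Disp;
  // ...matches the same arithmetic performed at the narrow width.
  uint8_t Narrow8 =
      (uint8_t)((uint8_t)Base + (uint8_t)(Index * 4) + (uint8_t)Disp);
  uint16_t Narrow16 =
      (uint16_t)((uint16_t)Base + (uint16_t)(Index * 4) + (uint16_t)Disp);
  assert((uint8_t)Wide == Narrow8);
  assert((uint16_t)Wide == Narrow16);
  return 0;
}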
+
// Pseudo instruction for lea that prevents the optimizer from eliminating
// the instruction.
let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in {
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 0c20ffe..5321ecf 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -406,16 +406,6 @@ def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>;
def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>;
-def X86VShldv : SDNode<"X86ISD::VSHLDV",
- SDTypeProfile<1, 3, [SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisSameAs<0,3>]>>;
-def X86VShrdv : SDNode<"X86ISD::VSHRDV",
- SDTypeProfile<1, 3, [SDTCisVec<0>,
- SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>,
- SDTCisSameAs<0,3>]>>;
def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 03ac1d3..1d2cd39 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -8113,6 +8113,39 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS) const {
+  // If LoadMI is a masked load, check that MI uses the same mask.
+ const MCInstrDesc &MCID = get(LoadMI.getOpcode());
+ unsigned NumOps = MCID.getNumOperands();
+ if (NumOps >= 3) {
+ Register MaskReg;
+ const MachineOperand &Op1 = LoadMI.getOperand(1);
+ const MachineOperand &Op2 = LoadMI.getOperand(2);
+
+ auto IsVKWMClass = [](const TargetRegisterClass *RC) {
+ return RC == &X86::VK2WMRegClass || RC == &X86::VK4WMRegClass ||
+ RC == &X86::VK8WMRegClass || RC == &X86::VK16WMRegClass ||
+ RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass;
+ };
+
+ if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI)))
+ MaskReg = Op1.getReg();
+ else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI)))
+ MaskReg = Op2.getReg();
+
+ if (MaskReg) {
+ bool HasSameMask = false;
+ for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) {
+ const MachineOperand &Op = MI.getOperand(I);
+ if (Op.isReg() && Op.getReg() == MaskReg) {
+ HasSameMask = true;
+ break;
+ }
+ }
+ if (!HasSameMask)
+ return nullptr;
+ }
+ }
+
// TODO: Support the case where LoadMI loads a wide register, but MI
// only uses a subreg.
for (auto Op : Ops) {
@@ -8121,7 +8154,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
}
// If loading from a FrameIndex, fold directly from the FrameIndex.
- unsigned NumOps = LoadMI.getDesc().getNumOperands();
int FrameIndex;
if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
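To restate the guard added above: a masked load defines only the lanes its mask selects, so folding it into a consumer is safe only when that consumer is governed by the same mask register. A toy 4-lane model of why a mask mismatch changes behavior (standalone C++, hypothetical lane values):

#include <array>
#include <cassert>

// A 4-lane masked load: lanes where the mask is clear keep the pass-through
// value instead of reading memory.
static std::array<int, 4> maskedLoad(const std::array<int, 4> &Mem,
                                     const std::array<bool, 4> &Mask,
                                     const std::array<int, 4> &PassThru) {
  std::array<int, 4> R{};
  for (int I = 0; I < 4; ++I)
    R[I] = Mask[I] ? Mem[I] : PassThru[I];
  return R;
}

int main() {
  std::array<int, 4> Mem{1, 2, 3, 4}, Zero{0, 0, 0, 0};
  std::array<bool, 4> K1{true, false, true, false};
  auto Loaded = maskedLoad(Mem, K1, Zero);
  // A consumer governed by a different mask (say, all lanes) would read
  // lanes 1 and 3 straight from Mem if the load were folded into it, but
  // the unfolded sequence delivers the pass-through value there instead.
  assert(Loaded[1] == 0 && Mem[1] == 2);
  return 0;
}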
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index d1ca0a6..59e103cd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -880,11 +880,11 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
// zext(bool) + C -> bool ? C + 1 : C
if (match(Op0, m_ZExt(m_Value(X))) &&
X->getType()->getScalarSizeInBits() == 1)
- return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1);
+ return createSelectInst(X, InstCombiner::AddOne(Op1C), Op1);
// sext(bool) + C -> bool ? C - 1 : C
if (match(Op0, m_SExt(m_Value(X))) &&
X->getType()->getScalarSizeInBits() == 1)
- return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1);
+ return createSelectInst(X, InstCombiner::SubOne(Op1C), Op1);
// ~X + C --> (C-1) - X
if (match(Op0, m_Not(m_Value(X)))) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c13c6cc..cf6d0ec 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -64,6 +64,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/KnownFPClass.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TypeSize.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
@@ -3781,6 +3782,17 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(CI, Res);
}
}
+
+ // vector.reduce.add.vNiM(splat(%x)) -> mul(%x, N)
+ if (Value *Splat = getSplatValue(Arg)) {
+ ElementCount VecToReduceCount =
+ cast<VectorType>(Arg->getType())->getElementCount();
+ if (VecToReduceCount.isFixed()) {
+ unsigned VectorSize = VecToReduceCount.getFixedValue();
+ return BinaryOperator::CreateMul(
+ Splat, ConstantInt::get(Splat->getType(), VectorSize));
+ }
+ }
}
[[fallthrough]];
}
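The new fold relies on nothing more than the identity that adding N copies of x yields x * N, which also holds under wrapping integer arithmetic. A quick standalone check:

#include <cassert>

int main() {
  int X = 7;
  const int N = 8; // fixed element count of the reduced vector
  int Sum = 0;
  for (int I = 0; I < N; ++I) // vector.reduce.add over splat(X)
    Sum += X;
  assert(Sum == X * N); // == mul(X, N), the form the fold emits
  return 0;
}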
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7a979c1..4f94aa2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -23,6 +23,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
@@ -62,14 +63,14 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
public InstVisitor<InstCombinerImpl, Instruction *> {
public:
InstCombinerImpl(InstructionWorklist &Worklist, BuilderTy &Builder,
- bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
+ Function &F, AAResults *AA, AssumptionCache &AC,
TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE,
BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI,
ProfileSummaryInfo *PSI, const DataLayout &DL,
ReversePostOrderTraversal<BasicBlock *> &RPOT)
- : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE,
- BFI, BPI, PSI, DL, RPOT) {}
+ : InstCombiner(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI, BPI,
+ PSI, DL, RPOT) {}
virtual ~InstCombinerImpl() = default;
@@ -469,6 +470,17 @@ private:
Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable,
unsigned Depth = 0);
+ SelectInst *createSelectInst(Value *C, Value *S1, Value *S2,
+ const Twine &NameStr = "",
+ InsertPosition InsertBefore = nullptr,
+ Instruction *MDFrom = nullptr) {
+ SelectInst *SI =
+ SelectInst::Create(C, S1, S2, NameStr, InsertBefore, MDFrom);
+ if (!MDFrom)
+ setExplicitlyUnknownBranchWeightsIfProfiled(*SI, F, DEBUG_TYPE);
+ return SI;
+ }
+
public:
/// Create and insert the idiom we use to indicate a block is unreachable
/// without having to rewrite the CFG from within InstCombine.
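The helper above is a factory-wrapper pattern: all SelectInst creation in the pass is funneled through one function so a policy, here tagging selects in profiled functions with explicitly-unknown branch weights when there is no metadata source to copy from, is applied consistently. A generic sketch of the shape (standalone C++; every name here is hypothetical, not LLVM's):

#include <memory>
#include <string>

// Hypothetical stand-ins for SelectInst and its profile metadata.
struct Select {
  bool HasBranchWeights = false;
  std::string WeightsSource;
};

// Wrapper: construct the instruction, then apply the default policy unless
// the caller supplied an instruction to copy metadata from (MDFrom).
static std::unique_ptr<Select> createSelect(const Select *MDFrom,
                                            bool FunctionIsProfiled) {
  auto S = std::make_unique<Select>();
  if (MDFrom) {
    *S = *MDFrom; // metadata is copied from the template instruction
  } else if (FunctionIsProfiled) {
    // No source to copy from: mark the weights as explicitly unknown so
    // later passes don't mistake "missing" for "measured as never taken".
    S->HasBranchWeights = true;
    S->WeightsSource = "unknown";
  }
  return S;
}

int main() {
  return createSelect(nullptr, /*FunctionIsProfiled=*/true)->HasBranchWeights
             ? 0
             : 1;
}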
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 550f095..d457e0c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1253,7 +1253,7 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
// shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
auto *NewC = Builder.CreateShl(ConstantInt::get(Ty, 1), C1);
- return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
+ return createSelectInst(X, NewC, ConstantInt::getNullValue(Ty));
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index f0ddd5c..8fbaf68 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1735,7 +1735,7 @@ Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) {
Constant *Zero = ConstantInt::getNullValue(BO.getType());
Value *TVal = Builder.CreateBinOp(BO.getOpcode(), Ones, C);
Value *FVal = Builder.CreateBinOp(BO.getOpcode(), Zero, C);
- return SelectInst::Create(X, TVal, FVal);
+ return createSelectInst(X, TVal, FVal);
}
static Value *simplifyOperationIntoSelectOperand(Instruction &I, SelectInst *SI,
@@ -5934,8 +5934,8 @@ static bool combineInstructionsOverFunction(
LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
- InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT,
- ORE, BFI, BPI, PSI, DL, RPOT);
+ InstCombinerImpl IC(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI,
+ BPI, PSI, DL, RPOT);
IC.MaxArraySizeForCombine = MaxArraySize;
bool MadeChangeInThisIteration = IC.prepareWorklist(F);
MadeChangeInThisIteration |= IC.run();
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index e5bf2d1..d842275 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -35,6 +35,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Regex.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation/CFGMST.h"
#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
@@ -92,8 +93,10 @@ class GCOVFunction;
class GCOVProfiler {
public:
- GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
- GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
+ GCOVProfiler()
+ : GCOVProfiler(GCOVOptions::getDefault(), *vfs::getRealFileSystem()) {}
+ GCOVProfiler(const GCOVOptions &Opts, vfs::FileSystem &VFS)
+ : Options(Opts), VFS(VFS) {}
bool
runOnModule(Module &M, function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
@@ -110,6 +113,7 @@ public:
os->write_zeros(4 - s.size() % 4);
}
void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
+ vfs::FileSystem &getVirtualFileSystem() const { return VFS; }
private:
// Create the .gcno files for the Module based on DebugInfo.
@@ -166,6 +170,7 @@ private:
std::vector<Regex> ExcludeRe;
DenseSet<const BasicBlock *> ExecBlocks;
StringMap<bool> InstrumentedFiles;
+ vfs::FileSystem &VFS;
};
struct BBInfo {
@@ -214,10 +219,10 @@ static StringRef getFunctionName(const DISubprogram *SP) {
/// Prefer relative paths in the coverage notes. Clang may also split
/// up absolute paths into a directory and filename component. When
/// the relative path doesn't exist, reconstruct the absolute path.
-static SmallString<128> getFilename(const DIScope *SP) {
+static SmallString<128> getFilename(const DIScope *SP, vfs::FileSystem &VFS) {
SmallString<128> Path;
StringRef RelPath = SP->getFilename();
- if (sys::fs::exists(RelPath))
+ if (VFS.exists(RelPath))
Path = RelPath;
else
sys::path::append(Path, SP->getDirectory(), SP->getFilename());
@@ -357,7 +362,7 @@ namespace {
void writeOut(uint32_t CfgChecksum) {
write(GCOV_TAG_FUNCTION);
- SmallString<128> Filename = getFilename(SP);
+ SmallString<128> Filename = getFilename(SP, P->getVirtualFileSystem());
uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP));
BlockLen += 1 + wordsOfString(Filename) + 4;
@@ -455,7 +460,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
if (FilterRe.empty() && ExcludeRe.empty()) {
return true;
}
- SmallString<128> Filename = getFilename(F.getSubprogram());
+ SmallString<128> Filename = getFilename(F.getSubprogram(), VFS);
auto It = InstrumentedFiles.find(Filename);
if (It != InstrumentedFiles.end()) {
return It->second;
@@ -467,7 +472,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
  // Path can be
  // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h, so in
  // such a case we must get the real_path.
- if (sys::fs::real_path(Filename, RealPath)) {
+ if (VFS.getRealPath(Filename, RealPath)) {
    // real_path can fail with a path like "foo.c".
RealFilename = Filename;
} else {
@@ -524,9 +529,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
SmallString<128> Filename = CU->getFilename();
sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
StringRef FName = sys::path::filename(Filename);
- SmallString<128> CurPath;
- if (sys::fs::current_path(CurPath))
+ ErrorOr<std::string> CWD = VFS.getCurrentWorkingDirectory();
+ if (!CWD)
return std::string(FName);
+ SmallString<128> CurPath{*CWD};
sys::path::append(CurPath, FName);
return std::string(CurPath);
}
@@ -554,7 +560,7 @@ bool GCOVProfiler::runOnModule(
PreservedAnalyses GCOVProfilerPass::run(Module &M,
ModuleAnalysisManager &AM) {
- GCOVProfiler Profiler(GCOVOpts);
+ GCOVProfiler Profiler(GCOVOpts, *VFS);
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
@@ -789,7 +795,7 @@ bool GCOVProfiler::emitProfileNotes(
// Add the function line number to the lines of the entry block
// to have a counter for the function definition.
uint32_t Line = SP->getLine();
- auto Filename = getFilename(SP);
+ auto Filename = getFilename(SP, VFS);
BranchProbabilityInfo *BPI = GetBPI(F);
BlockFrequencyInfo *BFI = GetBFI(F);
@@ -881,7 +887,7 @@ bool GCOVProfiler::emitProfileNotes(
if (SP != getDISubprogram(Scope))
continue;
- GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope()));
+ GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope(), VFS));
Lines.addLine(Loc.getLine());
}
Line = 0;
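The GCOV changes are a straightforward dependency injection: every file-system query now goes through a vfs::FileSystem reference instead of sys::fs, so tests can substitute an in-memory filesystem. A minimal sketch of the shape (standalone C++; this interface is a hypothetical miniature of vfs::FileSystem):

#include <map>
#include <string>

struct FileSystem {
  virtual ~FileSystem() = default;
  virtual bool exists(const std::string &Path) const = 0;
};

struct InMemoryFS : FileSystem {
  std::map<std::string, std::string> Files;
  bool exists(const std::string &Path) const override {
    return Files.count(Path) != 0;
  }
};

// The profiler holds a reference and never touches the real disk directly.
struct Profiler {
  FileSystem &FS;
  explicit Profiler(FileSystem &FS) : FS(FS) {}
  std::string pickFilename(const std::string &Rel, const std::string &Abs) {
    return FS.exists(Rel) ? Rel : Abs; // mirrors getFilename's preference
  }
};

int main() {
  InMemoryFS FS;
  FS.Files["a.c"] = "";
  Profiler P(FS);
  return P.pickFilename("a.c", "/src/a.c") == "a.c" ? 0 : 1;
}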
diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
index c215228..89980d5 100644
--- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
+++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/DropUnnecessaryAssumes.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -17,13 +18,48 @@ using namespace llvm;
using namespace llvm::PatternMatch;
static bool affectedValuesAreEphemeral(ArrayRef<Value *> Affected) {
- // If all the affected uses have only one use (part of the assume), then
- // the assume does not provide useful information. Note that additional
- // users may appear as a result of inlining and CSE, so we should only
- // make this assumption late in the optimization pipeline.
- // TODO: Handle dead cyclic usages.
- // TODO: Handle multiple dead assumes on the same value.
- return all_of(Affected, match_fn(m_OneUse(m_Value())));
+ // Check whether all the uses are ephemeral, i.e. recursively only used
+ // by assumes. In that case, the assume does not provide useful information.
+ // Note that additional users may appear as a result of inlining and CSE,
+ // so we should only make this assumption late in the optimization pipeline.
+ SmallSetVector<Instruction *, 32> Worklist;
+ auto AddUsers = [&](Value *V) {
+ for (User *U : V->users()) {
+ // Bail out if we need to inspect too many users.
+ if (Worklist.size() >= 32)
+ return false;
+ Worklist.insert(cast<Instruction>(U));
+ }
+ return true;
+ };
+
+ for (Value *V : Affected) {
+ // Do not handle assumes on globals for now. The use list for them may
+ // contain uses in other functions.
+ if (!isa<Instruction, Argument>(V))
+ return false;
+
+ if (!AddUsers(V))
+ return false;
+ }
+
+ for (unsigned Idx = 0; Idx < Worklist.size(); ++Idx) {
+ Instruction *I = Worklist[Idx];
+
+ // Use in assume is ephemeral.
+ if (isa<AssumeInst>(I))
+ continue;
+
+    // Use in a side-effecting instruction or terminator is non-ephemeral.
+ if (I->mayHaveSideEffects() || I->isTerminator())
+ return false;
+
+ // Otherwise, recursively look at the users.
+ if (!AddUsers(I))
+ return false;
+ }
+
+ return true;
}
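Restated, the new routine is a bounded breadth-first walk of the use graph: uses inside assumes are ephemeral, side-effecting or terminator uses are not, everything else is expanded further, and the walk gives up past a fixed budget of 32 users. A toy version over a hypothetical node type (standalone C++):

#include <cstddef>
#include <set>
#include <vector>

struct Node {
  bool IsAssume = false;
  bool HasSideEffects = false;
  std::vector<Node *> Users;
};

static bool allUsesEphemeral(const std::vector<Node *> &Affected) {
  std::vector<Node *> Worklist;
  std::set<Node *> Seen;
  auto AddUsers = [&](Node *N) {
    for (Node *U : N->Users) {
      if (Seen.size() >= 32) // bail out on too many users to inspect
        return false;
      if (Seen.insert(U).second)
        Worklist.push_back(U);
    }
    return true;
  };
  for (Node *V : Affected)
    if (!AddUsers(V))
      return false;
  for (size_t I = 0; I < Worklist.size(); ++I) {
    Node *N = Worklist[I];
    if (N->IsAssume)
      continue; // use inside an assume is ephemeral
    if (N->HasSideEffects)
      return false; // observable use: not ephemeral
    if (!AddUsers(N)) // otherwise, recurse into its users
      return false;
  }
  return true;
}

int main() {
  Node Assume{true, false, {}};
  Node Add{false, false, {&Assume}};
  Node V{false, false, {&Add}};
  return allUsesEphemeral({&V}) ? 0 : 1; // only reaches an assume -> drop
}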
PreservedAnalyses
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 2d84b4a..216bdf4 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -84,7 +84,6 @@
#include <cstdint>
#include <iterator>
#include <map>
-#include <numeric>
#include <optional>
#include <set>
#include <tuple>
@@ -6356,25 +6355,25 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
if (DefaultResult) {
Value *ValueCompare =
Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
- SelectInst *SelectValueInst = cast<SelectInst>(Builder.CreateSelect(
- ValueCompare, ResultVector[1].first, DefaultResult, "switch.select"));
- SelectValue = SelectValueInst;
- if (HasBranchWeights) {
+ SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+ DefaultResult, "switch.select");
+ if (auto *SI = dyn_cast<SelectInst>(SelectValue);
+ SI && HasBranchWeights) {
      // We start with 3 probabilities, where the numerator is the
      // corresponding BranchWeights[i] and the denominator is the sum over
      // BranchWeights. We want the probability and the complementary
      // probability of Condition == SecondCase.
assert(BranchWeights.size() == 3);
- setBranchWeights(SelectValueInst, BranchWeights[2],
+ setBranchWeights(SI, BranchWeights[2],
BranchWeights[0] + BranchWeights[1],
/*IsExpected=*/false);
}
}
Value *ValueCompare =
Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
- SelectInst *Ret = cast<SelectInst>(Builder.CreateSelect(
- ValueCompare, ResultVector[0].first, SelectValue, "switch.select"));
- if (HasBranchWeights) {
+ Value *Ret = Builder.CreateSelect(ValueCompare, ResultVector[0].first,
+ SelectValue, "switch.select");
+ if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
    // We may have had a DefaultResult. Base the positions of the first and
    // second case's branch weights accordingly. Also the probability that
    // Condition != FirstCase needs to take that into account.
@@ -6382,7 +6381,7 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
size_t FirstCasePos = (Condition != nullptr);
size_t SecondCasePos = FirstCasePos + 1;
uint32_t DefaultCase = (Condition != nullptr) ? BranchWeights[0] : 0;
- setBranchWeights(Ret, BranchWeights[FirstCasePos],
+ setBranchWeights(SI, BranchWeights[FirstCasePos],
DefaultCase + BranchWeights[SecondCasePos],
/*IsExpected=*/false);
}
@@ -6422,13 +6421,13 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
Value *And = Builder.CreateAnd(Condition, AndMask);
Value *Cmp = Builder.CreateICmpEQ(
And, Constant::getIntegerValue(And->getType(), AndMask));
- SelectInst *Ret = cast<SelectInst>(
- Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
- if (HasBranchWeights) {
+ Value *Ret =
+ Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+ if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
// We know there's a Default case. We base the resulting branch
// weights off its probability.
assert(BranchWeights.size() >= 2);
- setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+ setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
BranchWeights[0], /*IsExpected=*/false);
}
return Ret;
@@ -6448,11 +6447,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
Value *And = Builder.CreateAnd(Condition, ~BitMask, "switch.and");
Value *Cmp = Builder.CreateICmpEQ(
And, Constant::getNullValue(And->getType()), "switch.selectcmp");
- SelectInst *Ret = cast<SelectInst>(
- Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
- if (HasBranchWeights) {
+ Value *Ret =
+ Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+ if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
assert(BranchWeights.size() >= 2);
- setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+ setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
BranchWeights[0], /*IsExpected=*/false);
}
return Ret;
@@ -6466,11 +6465,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
Value *Cmp2 = Builder.CreateICmpEQ(Condition, CaseValues[1],
"switch.selectcmp.case2");
Value *Cmp = Builder.CreateOr(Cmp1, Cmp2, "switch.selectcmp");
- SelectInst *Ret = cast<SelectInst>(
- Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
- if (HasBranchWeights) {
+ Value *Ret =
+ Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+ if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
assert(BranchWeights.size() >= 2);
- setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+ setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
BranchWeights[0], /*IsExpected=*/false);
}
return Ret;
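Concretely, the weight bookkeeping these hunks preserve: when a switch collapses to a select, the select's true weight is the matched case's weight and its false weight is the sum of all remaining weights. A small worked check (standalone C++, made-up weights):

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical switch weights: index 0 is the default, the rest are cases.
  std::vector<uint32_t> BranchWeights{10, 30, 60};
  // Two-case form: select(Cond == FirstCase, ...) takes the case's own
  // weight; default + second case form the false weight.
  uint32_t TrueW = BranchWeights[1];
  uint32_t FalseW = BranchWeights[0] + BranchWeights[2];
  assert(TrueW == 30 && FalseW == 70);
  // Single-compare forms: the false weight is just the default, and the
  // true weight accumulates all non-default weights (the drop_begin sum).
  uint32_t NonDefault =
      std::accumulate(BranchWeights.begin() + 1, BranchWeights.end(), 0u);
  assert(NonDefault == 90 && BranchWeights[0] == 10);
  return 0;
}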
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 93a5f22..ab5c9c9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2438,8 +2438,9 @@ struct CSEDenseMapInfo {
} // end anonymous namespace
-///Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
+/// FIXME: This legacy common-subexpression-elimination routine is scheduled
+/// for removal in favor of the VPlan-based one.
+static void legacyCSE(BasicBlock *BB) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
for (Instruction &In : llvm::make_early_inc_range(*BB)) {
@@ -2543,7 +2544,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
// Remove redundant induction instructions.
- cse(HeaderBB);
+ legacyCSE(HeaderBB);
}
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -3901,7 +3902,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
if (VF.isScalar())
continue;
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -4158,7 +4160,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
// Add on other costs that are modelled in VPlan, but not in the legacy
// cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
assert(VectorRegion && "Expected to have a vector region!");
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
@@ -6833,7 +6836,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7066,7 +7069,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+ *CM.PSE.getSE());
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
@@ -8596,7 +8600,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+ *CM.PSE.getSE());
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}
@@ -10053,7 +10058,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM,
- CM.CostKind);
+ CM.CostKind, *CM.PSE.getSE());
if (!ForceVectorization &&
!isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
LVP.getPlanFor(VF.Width), SEL,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 065622e..c547662 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1100,7 +1100,9 @@ class BinOpSameOpcodeHelper {
// constant + x cannot be -constant - x
// instead, it should be x - -constant
if (Pos == 1 ||
- (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
+ ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
+ FromOpcode == Instruction::Xor) &&
+ ToOpcode == Instruction::Sub))
return SmallVector<Value *>({LHS, RHS});
return SmallVector<Value *>({RHS, LHS});
}
@@ -1188,6 +1190,10 @@ public:
if (CIValue.isAllOnes())
InterchangeableMask = CanBeAll;
break;
+ case Instruction::Xor:
+ if (CIValue.isZero())
+ InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
+ break;
default:
if (CIValue.isZero())
InterchangeableMask = CanBeAll;
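The new Xor case rests on x ^ 0 being the identity, so the instruction can be re-expressed with any of the listed opcodes once the constant is adjusted (0 for or/add/sub, all-ones for and). A quick standalone check:

#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  for (uint8_t X : {uint8_t(0), uint8_t(1), uint8_t(0x7F), uint8_t(0xFF)}) {
    assert((uint8_t)(X ^ 0) == X);    // the original instruction
    assert((uint8_t)(X | 0) == X);    // same value as an Or
    assert((uint8_t)(X + 0) == X);    // ... an Add
    assert((uint8_t)(X - 0) == X);    // ... a Sub
    assert((uint8_t)(X & 0xFF) == X); // ... an And, with an all-ones constant
  }
  return 0;
}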
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a1c6f79..728d291 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -845,19 +845,10 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
if (VF.isScalable())
return InstructionCost::getInvalid();
- // First compute the cost of the conditionally executed recipes, followed by
- // account for the branching cost, except if the mask is a header mask or
- // uniform condition.
- using namespace llvm::VPlanPatternMatch;
+ // Compute and return the cost of the conditionally executed recipes.
+ assert(VF.isVector() && "Can only compute vector cost at the moment.");
VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
- InstructionCost ThenCost = Then->cost(VF, Ctx);
-
- // For the scalar case, we may not always execute the original predicated
- // block, Thus, scale the block's cost by the probability of executing it.
- if (VF.isScalar())
- return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
-
- return ThenCost;
+ return Then->cost(VF, Ctx);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1759,7 +1750,8 @@ VPCostContext::getOperandInfo(VPValue *V) const {
}
InstructionCost VPCostContext::getScalarizationOverhead(
- Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF) {
+ Type *ResultTy, ArrayRef<const VPValue *> Operands, ElementCount VF,
+ bool AlwaysIncludeReplicatingR) {
if (VF.isScalar())
return 0;
@@ -1779,7 +1771,9 @@ InstructionCost VPCostContext::getScalarizationOverhead(
SmallPtrSet<const VPValue *, 4> UniqueOperands;
SmallVector<Type *> Tys;
for (auto *Op : Operands) {
- if (Op->isLiveIn() || isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op) ||
+ if (Op->isLiveIn() ||
+ (!AlwaysIncludeReplicatingR &&
+ isa<VPReplicateRecipe, VPPredInstPHIRecipe>(Op)) ||
!UniqueOperands.insert(Op).second)
continue;
Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index fe59774..2a8baec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
TargetTransformInfo::TargetCostKind CostKind;
+ ScalarEvolution &SE;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
const VPlan &Plan, LoopVectorizationCostModel &CM,
- TargetTransformInfo::TargetCostKind CostKind)
+ TargetTransformInfo::TargetCostKind CostKind,
+ ScalarEvolution &SE)
: TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
- CostKind(CostKind) {}
+ CostKind(CostKind), SE(SE) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
/// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
/// and \p Operands with \p VF. This is a convenience wrapper for the
- /// type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Type *ResultTy,
- ArrayRef<const VPValue *> Operands,
- ElementCount VF);
+ /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+ /// is true, always compute the cost of scalarizing replicating operands.
+ InstructionCost
+ getScalarizationOverhead(Type *ResultTy, ArrayRef<const VPValue *> Operands,
+ ElementCount VF,
+ bool AlwaysIncludeReplicatingR = false);
};
/// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index cf5e6bf..b5e30cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const {
});
}
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+ auto *PtrR = Ptr->getDefiningRecipe();
+ if (!PtrR || !((isa<VPReplicateRecipe>(PtrR) &&
+ cast<VPReplicateRecipe>(PtrR)->getOpcode() ==
+ Instruction::GetElementPtr) ||
+ isa<VPWidenGEPRecipe>(PtrR)))
+ return false;
+
+ // We are looking for a GEP where all indices are either loop invariant or
+ // inductions.
+ for (VPValue *Opd : drop_begin(PtrR->operands())) {
+ if (!Opd->isDefinedOutsideLoopRegions() &&
+ !isa<VPScalarIVStepsRecipe, VPWidenIntOrFpInductionRecipe>(Opd))
+ return false;
+ }
+
+ return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+ SmallPtrSet<const VPUser *, 4> Seen;
+ SmallVector<const VPUser *> WorkList = {V};
+
+ while (!WorkList.empty()) {
+ auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
+ if (!Cur || !Seen.insert(Cur).second)
+ continue;
+
+ for (VPUser *U : Cur->users()) {
+ if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(U))
+ if (InterleaveR->getAddr() == Cur)
+ return true;
+ if (auto *RepR = dyn_cast<VPReplicateRecipe>(U)) {
+ if (RepR->getOpcode() == Instruction::Load &&
+ RepR->getOperand(0) == Cur)
+ return true;
+ if (RepR->getOpcode() == Instruction::Store &&
+ RepR->getOperand(1) == Cur)
+ return true;
+ }
+ if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(U)) {
+ if (MemR->getAddr() == Cur && MemR->isConsecutive())
+ return true;
+ }
+ }
+
+ append_range(WorkList, cast<VPSingleDefRecipe>(Cur)->users());
+ }
+ return false;
+}
+
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
}
case Instruction::Load:
case Instruction::Store: {
- if (isSingleScalar()) {
- bool IsLoad = UI->getOpcode() == Instruction::Load;
- Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
- Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
- const Align Alignment = getLoadStoreAlignment(UI);
- unsigned AS = getLoadStoreAddressSpace(UI);
- TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
- InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
- UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
- return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
- ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
- }
+ if (VF.isScalable() && !isSingleScalar())
+ return InstructionCost::getInvalid();
+
// TODO: See getMemInstScalarizationCost for how to handle replicating and
// predicated cases.
- break;
+ const VPRegionBlock *ParentRegion = getParent()->getParent();
+ if (ParentRegion && ParentRegion->isReplicator())
+ break;
+
+ bool IsLoad = UI->getOpcode() == Instruction::Load;
+ const VPValue *PtrOp = getOperand(!IsLoad);
+ // TODO: Handle cases where we need to pass a SCEV to
+ // getAddressComputationCost.
+ if (shouldUseAddressAccessSCEV(PtrOp))
+ break;
+
+ Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+ Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+ const Align Alignment = getLoadStoreAlignment(UI);
+ unsigned AS = getLoadStoreAddressSpace(UI);
+ TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+ InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+ UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+ Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+ InstructionCost ScalarCost =
+ ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+ PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+ if (isSingleScalar())
+ return ScalarCost;
+
+ SmallVector<const VPValue *> OpsToScalarize;
+ Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general if the target prefers
+    // vectorized addressing or if the loaded value is used as part of the
+    // address of another load or store.
+ bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+ if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+ bool EfficientVectorLoadStore =
+ Ctx.TTI.supportsEfficientVectorElementLoadStore();
+ if (!(IsLoad && !PreferVectorizedAddressing) &&
+ !(!IsLoad && EfficientVectorLoadStore))
+ append_range(OpsToScalarize, operands());
+
+ if (!EfficientVectorLoadStore)
+ ResultTy = Ctx.Types.inferScalarType(this);
+ }
+
+ return (ScalarCost * VF.getFixedValue()) +
+ Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
}
}
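Pulling the new load/store branch into one formula: the replicated cost is VF copies of (scalar memory op + address computation), plus scalarization overhead for inserting the results and extracting the operands when the target does not model those as free. A hedged numeric sketch with made-up unit costs (not real TTI numbers):

#include <cassert>

int main() {
  // Illustrative unit costs only.
  const unsigned VF = 4;
  const unsigned ScalarMemOpCost = 1;
  const unsigned AddressCompCost = 1;
  const unsigned InsertExtractCost = 1;

  unsigned ScalarCost = ScalarMemOpCost + AddressCompCost;
  // One scalar copy per lane...
  unsigned Replicated = ScalarCost * VF;
  // ...plus building the vector result (one insert per lane) and extracting
  // the scalarized operands (here assume one, e.g. a load's address), when
  // the target models neither as free.
  unsigned NumScalarizedOps = 1;
  unsigned Overhead = VF * InsertExtractCost               // result inserts
                      + VF * InsertExtractCost * NumScalarizedOps; // extracts
  assert(Replicated + Overhead == 16);
  return 0;
}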
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 58fab8f..5252e1f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2853,6 +2853,7 @@ void VPlanTransforms::replaceSymbolicStrides(
return R->getParent()->getParent() ||
R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
};
+ ValueToSCEVMapTy RewriteMap;
for (const SCEV *Stride : StridesMap.values()) {
using namespace SCEVPatternMatch;
auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
@@ -2880,6 +2881,22 @@ void VPlanTransforms::replaceSymbolicStrides(
VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
}
+ RewriteMap[StrideV] = PSE.getSCEV(StrideV);
+ }
+
+ for (VPRecipeBase &R : *Plan.getEntry()) {
+ auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
+ if (!ExpSCEV)
+ continue;
+ const SCEV *ScevExpr = ExpSCEV->getSCEV();
+ auto *NewSCEV =
+ SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
+ if (NewSCEV != ScevExpr) {
+ VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
+ ExpSCEV->replaceAllUsesWith(NewExp);
+ if (Plan.getTripCount() == ExpSCEV)
+ Plan.resetTripCount(NewExp);
+ }
}
}
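The second loop generalizes the first: any SCEV already expanded in the entry block has its symbolic stride parameters substituted through a rewrite map, so e.g. a trip count written in terms of %stride can simplify once versioning pins the stride down. A toy substitution over a hypothetical string-based expression (standalone C++, standing in for SCEVParameterRewriter::rewrite):

#include <cassert>
#include <map>
#include <string>

// Replace symbolic parameters by their versioned values in an expression.
static std::string rewrite(std::string Expr,
                           const std::map<std::string, std::string> &Map) {
  for (const auto &[From, To] : Map) {
    for (std::string::size_type Pos = Expr.find(From);
         Pos != std::string::npos; Pos = Expr.find(From, Pos + To.size()))
      Expr.replace(Pos, From.size(), To);
  }
  return Expr;
}

int main() {
  std::map<std::string, std::string> RewriteMap{{"%stride", "1"}};
  // A trip-count-like expression that only simplifies once the versioned
  // assumptions make the stride a known constant.
  assert(rewrite("(%n /u %stride)", RewriteMap) == "(%n /u 1)");
  return 0;
}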
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index eac0e70..0599930 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -13,6 +13,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
return all_of(Def->users(),
@@ -63,7 +64,6 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
};
VPValue *A, *B;
- using namespace VPlanPatternMatch;
if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
return B == Plan.getTripCount() &&
@@ -90,7 +90,6 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
}
bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
- using namespace VPlanPatternMatch;
// Live-ins are uniform.
if (V->isLiveIn())
return true;
@@ -159,7 +158,6 @@ std::optional<VPValue *>
vputils::getRecipesForUncountableExit(VPlan &Plan,
SmallVectorImpl<VPRecipeBase *> &Recipes,
SmallVectorImpl<VPRecipeBase *> &GEPs) {
- using namespace llvm::VPlanPatternMatch;
// Given a VPlan like the following (just including the recipes contributing
// to loop control exiting here, not the actual work), we're looking to match
// the recipes contributing to the uncountable exit condition comparison
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 0ef933f..32704bd 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2487,21 +2487,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
return false;
+ // Check whether this is a binary shuffle.
+ bool IsBinaryShuffle = !isa<UndefValue>(V1);
+
auto *C0 = dyn_cast<CastInst>(V0);
auto *C1 = dyn_cast<CastInst>(V1);
- if (!C0 || !C1)
+ if (!C0 || (IsBinaryShuffle && !C1))
return false;
Instruction::CastOps Opcode = C0->getOpcode();
- if (C0->getSrcTy() != C1->getSrcTy())
+
+ // If this is allowed, foldShuffleOfCastops can get stuck in a loop
+ // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
+ if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
return false;
- // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
- if (Opcode != C1->getOpcode()) {
- if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
- Opcode = Instruction::SExt;
- else
+ if (IsBinaryShuffle) {
+ if (C0->getSrcTy() != C1->getSrcTy())
return false;
+ // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
+ if (Opcode != C1->getOpcode()) {
+ if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
+ Opcode = Instruction::SExt;
+ else
+ return false;
+ }
}
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
@@ -2544,23 +2554,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
InstructionCost CostC0 =
TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
TTI::CastContextHint::None, CostKind);
- InstructionCost CostC1 =
- TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
- TTI::CastContextHint::None, CostKind);
- InstructionCost OldCost = CostC0 + CostC1;
- OldCost +=
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
- CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
- InstructionCost NewCost =
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
- CastSrcTy, NewMask, CostKind);
+ TargetTransformInfo::ShuffleKind ShuffleKind;
+ if (IsBinaryShuffle)
+ ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
+ else
+ ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
+
+ InstructionCost OldCost = CostC0;
+ OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
+ CostKind, 0, nullptr, {}, &I);
+
+ InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
+ CastSrcTy, NewMask, CostKind);
NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
TTI::CastContextHint::None, CostKind);
if (!C0->hasOneUse())
NewCost += CostC0;
- if (!C1->hasOneUse())
- NewCost += CostC1;
+ if (IsBinaryShuffle) {
+ InstructionCost CostC1 =
+ TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
+ TTI::CastContextHint::None, CostKind);
+ OldCost += CostC1;
+ if (!C1->hasOneUse())
+ NewCost += CostC1;
+ }
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -2568,14 +2586,20 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
if (NewCost > OldCost)
return false;
- Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
- C1->getOperand(0), NewMask);
+ Value *Shuf;
+ if (IsBinaryShuffle)
+ Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
+ NewMask);
+ else
+ Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
+
Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
// Intersect flags from the old casts.
if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
NewInst->copyIRFlags(C0);
- NewInst->andIRFlags(C1);
+ if (IsBinaryShuffle)
+ NewInst->andIRFlags(C1);
}
Worklist.pushValue(Shuf);
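The transform itself is the usual cast/shuffle commutation, now also applied to unary shuffles: permuting after a widening cast equals casting after permuting the narrow source, so the rewritten form needs only one cast and a narrower shuffle. A quick standalone check for zext (plain C++):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint8_t, 4> Src{10, 20, 30, 40};
  std::array<int, 4> Mask{3, 2, 1, 0};

  // Original form: widen first, then shuffle the wide vector.
  std::array<uint32_t, 4> Wide{};
  for (int I = 0; I < 4; ++I)
    Wide[I] = Src[I];
  std::array<uint32_t, 4> A{};
  for (int I = 0; I < 4; ++I)
    A[I] = Wide[Mask[I]];

  // Folded form: shuffle the narrow source, then apply a single cast.
  std::array<uint8_t, 4> Shuf{};
  for (int I = 0; I < 4; ++I)
    Shuf[I] = Src[Mask[I]];
  std::array<uint32_t, 4> B{};
  for (int I = 0; I < 4; ++I)
    B[I] = Shuf[I];

  assert(A == B);
  return 0;
}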
@@ -4433,7 +4457,7 @@ bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
  // Create the new mask using the difference of the two incoming masks.
int MaskOffset = NewMask[0u];
- unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
+ unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
NewMask.clear();
for (unsigned I = 0u; I < InputNumElements; ++I) {
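For the sign fix above, a concrete instance of how the two formulas diverge (values are illustrative only; the surrounding code determines what sign convention MaskOffset actually carries):

#include <cassert>

int main() {
  int InputNumElements = 4;
  int MaskOffset = 1; // first element of the difference mask, illustrative
  int OldIndex = (InputNumElements - MaskOffset) % InputNumElements;
  int NewIndex = (InputNumElements + MaskOffset) % InputNumElements;
  assert(OldIndex == 3 && NewIndex == 1); // rotation index flips direction
  return 0;
}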
diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll
index 1c216e7..e371748 100644
--- a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll
@@ -11,6 +11,16 @@ entry:
ret <4 x i16> %1
}
+define <4 x half> @v4bf16_to_v4f16(float, <4 x bfloat> %a) nounwind {
+; CHECK-LABEL: v4bf16_to_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+entry:
+ %1 = bitcast <4 x bfloat> %a to <4 x half>
+ ret <4 x half> %1
+}
+
define <2 x i32> @v4bf16_to_v2i32(float, <4 x bfloat> %a) nounwind {
; CHECK-LABEL: v4bf16_to_v2i32:
; CHECK: // %bb.0: // %entry
@@ -82,6 +92,16 @@ entry:
ret <4 x bfloat> %1
}
+define <4 x bfloat> @v4f16_to_v4bf16(float, <4 x half> %a) nounwind {
+; CHECK-LABEL: v4f16_to_v4bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+entry:
+ %1 = bitcast <4 x half> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
define <4 x bfloat> @v2i32_to_v4bf16(float, <2 x i32> %a) nounwind {
; CHECK-LABEL: v2i32_to_v4bf16:
; CHECK: // %bb.0: // %entry
@@ -152,6 +172,16 @@ entry:
ret <8 x i16> %1
}
+define <8 x half> @v8bf16_to_v8f16(float, <8 x bfloat> %a) nounwind {
+; CHECK-LABEL: v8bf16_to_v8f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+ %1 = bitcast <8 x bfloat> %a to <8 x half>
+ ret <8 x half> %1
+}
+
define <4 x i32> @v8bf16_to_v4i32(float, <8 x bfloat> %a) nounwind {
; CHECK-LABEL: v8bf16_to_v4i32:
; CHECK: // %bb.0: // %entry
@@ -202,6 +232,16 @@ entry:
ret <8 x bfloat> %1
}
+define <8 x bfloat> @v8f16_to_v8bf16(float, <8 x half> %a) nounwind {
+; CHECK-LABEL: v8f16_to_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+ %1 = bitcast <8 x half> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
define <8 x bfloat> @v4i32_to_v8bf16(float, <4 x i32> %a) nounwind {
; CHECK-LABEL: v4i32_to_v8bf16:
; CHECK: // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
index 0960c4c..a56d5b1 100644
--- a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
+++ b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
@@ -78,9 +78,8 @@ B:
define i32 @g_i8_sign_extend_inreg(i8 %in, i32 %a, i32 %b) nounwind {
; CHECK-LABEL: g_i8_sign_extend_inreg:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w8, w1, w2, mi
+; CHECK-NEXT: tst w0, #0x80
+; CHECK-NEXT: csel w8, w1, w2, ne
; CHECK-NEXT: add w0, w8, w0, uxtb
; CHECK-NEXT: ret
entry:
@@ -100,9 +99,8 @@ B:
define i32 @g_i16_sign_extend_inreg(i16 %in, i32 %a, i32 %b) nounwind {
; CHECK-LABEL: g_i16_sign_extend_inreg:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w8, w1, w2, mi
+; CHECK-NEXT: tst w0, #0x8000
+; CHECK-NEXT: csel w8, w1, w2, ne
; CHECK-NEXT: add w0, w8, w0, uxth
; CHECK-NEXT: ret
entry:
@@ -167,10 +165,8 @@ B:
define i64 @g_i32_sign_extend_i64(i32 %in, i64 %a, i64 %b) nounwind {
; CHECK-LABEL: g_i32_sign_extend_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: sxtw x8, w0
-; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x8, x1, x2, mi
+; CHECK-NEXT: tst w0, #0x80000000
+; CHECK-NEXT: csel x8, x1, x2, ne
; CHECK-NEXT: add x0, x8, w0, uxtw
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
index 8bc3497..6233ce7 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll
@@ -1,20 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-CVT
-; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FP16
+; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-SD
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-SD
+; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-GI
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-GI
define <4 x half> @add_h(<4 x half> %a, <4 x half> %b) {
-; CHECK-CVT-LABEL: add_h:
-; CHECK-CVT: // %bb.0: // %entry
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: add_h:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: add_h:
; CHECK-FP16: // %bb.0: // %entry
; CHECK-FP16-NEXT: fadd v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: add_h:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
entry:
%0 = fadd <4 x half> %a, %b
@@ -22,28 +32,54 @@ entry:
}
define <4 x half> @build_h4(<4 x half> %a) {
-; CHECK-COMMON-LABEL: build_h4:
-; CHECK-COMMON: // %bb.0: // %entry
-; CHECK-COMMON-NEXT: mov w8, #15565 // =0x3ccd
-; CHECK-COMMON-NEXT: dup v0.4h, w8
-; CHECK-COMMON-NEXT: ret
+; CHECK-CVT-SD-LABEL: build_h4:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: mov w8, #15565 // =0x3ccd
+; CHECK-CVT-SD-NEXT: dup v0.4h, w8
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: build_h4:
+; CHECK-FP16-SD: // %bb.0: // %entry
+; CHECK-FP16-SD-NEXT: mov w8, #15565 // =0x3ccd
+; CHECK-FP16-SD-NEXT: dup v0.4h, w8
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: build_h4:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: adrp x8, .LCPI1_0
+; CHECK-CVT-GI-NEXT: ldr d0, [x8, :lo12:.LCPI1_0]
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: build_h4:
+; CHECK-FP16-GI: // %bb.0: // %entry
+; CHECK-FP16-GI-NEXT: adrp x8, .LCPI1_0
+; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI1_0]
+; CHECK-FP16-GI-NEXT: ret
entry:
ret <4 x half> <half 0xH3CCD, half 0xH3CCD, half 0xH3CCD, half 0xH3CCD>
}
define <4 x half> @sub_h(<4 x half> %a, <4 x half> %b) {
-; CHECK-CVT-LABEL: sub_h:
-; CHECK-CVT: // %bb.0: // %entry
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fsub v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: sub_h:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: sub_h:
; CHECK-FP16: // %bb.0: // %entry
; CHECK-FP16-NEXT: fsub v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: sub_h:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
entry:
%0 = fsub <4 x half> %a, %b
@@ -51,18 +87,26 @@ entry:
}
define <4 x half> @mul_h(<4 x half> %a, <4 x half> %b) {
-; CHECK-CVT-LABEL: mul_h:
-; CHECK-CVT: // %bb.0: // %entry
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: mul_h:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: mul_h:
; CHECK-FP16: // %bb.0: // %entry
; CHECK-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: mul_h:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
entry:
%0 = fmul <4 x half> %a, %b
@@ -70,18 +114,26 @@ entry:
}
define <4 x half> @div_h(<4 x half> %a, <4 x half> %b) {
-; CHECK-CVT-LABEL: div_h:
-; CHECK-CVT: // %bb.0: // %entry
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fdiv v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: div_h:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: div_h:
; CHECK-FP16: // %bb.0: // %entry
; CHECK-FP16-NEXT: fdiv v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: div_h:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
entry:
%0 = fdiv <4 x half> %a, %b
@@ -89,92 +141,162 @@ entry:
}
define <4 x half> @load_h(ptr %a) {
-; CHECK-COMMON-LABEL: load_h:
-; CHECK-COMMON: // %bb.0: // %entry
-; CHECK-COMMON-NEXT: ldr d0, [x0]
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: load_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
entry:
%0 = load <4 x half>, ptr %a, align 4
ret <4 x half> %0
}
define void @store_h(ptr %a, <4 x half> %b) {
-; CHECK-COMMON-LABEL: store_h:
-; CHECK-COMMON: // %bb.0: // %entry
-; CHECK-COMMON-NEXT: str d0, [x0]
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: store_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
entry:
store <4 x half> %b, ptr %a, align 4
ret void
}
define <4 x half> @s_to_h(<4 x float> %a) {
-; CHECK-COMMON-LABEL: s_to_h:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: s_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
%1 = fptrunc <4 x float> %a to <4 x half>
ret <4 x half> %1
}
define <4 x half> @d_to_h(<4 x double> %a) {
-; CHECK-COMMON-LABEL: d_to_h:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-COMMON-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-COMMON-NEXT: ret
+; CHECK-CVT-SD-LABEL: d_to_h:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: d_to_h:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: d_to_h:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: mov d2, v0.d[1]
+; CHECK-CVT-GI-NEXT: fcvt h0, d0
+; CHECK-CVT-GI-NEXT: mov d3, v1.d[1]
+; CHECK-CVT-GI-NEXT: fcvt h1, d1
+; CHECK-CVT-GI-NEXT: fcvt h2, d2
+; CHECK-CVT-GI-NEXT: mov v0.h[1], v2.h[0]
+; CHECK-CVT-GI-NEXT: fcvt h2, d3
+; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-CVT-GI-NEXT: mov v0.h[3], v2.h[0]
+; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: d_to_h:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: mov d2, v0.d[1]
+; CHECK-FP16-GI-NEXT: fcvt h0, d0
+; CHECK-FP16-GI-NEXT: mov d3, v1.d[1]
+; CHECK-FP16-GI-NEXT: fcvt h1, d1
+; CHECK-FP16-GI-NEXT: fcvt h2, d2
+; CHECK-FP16-GI-NEXT: mov v0.h[1], v2.h[0]
+; CHECK-FP16-GI-NEXT: fcvt h2, d3
+; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov v0.h[3], v2.h[0]
+; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-FP16-GI-NEXT: ret
%1 = fptrunc <4 x double> %a to <4 x half>
ret <4 x half> %1
}
define <4 x float> @h_to_s(<4 x half> %a) {
-; CHECK-COMMON-LABEL: h_to_s:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: h_to_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: ret
%1 = fpext <4 x half> %a to <4 x float>
ret <4 x float> %1
}
define <4 x double> @h_to_d(<4 x half> %a) {
-; CHECK-COMMON-LABEL: h_to_d:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-COMMON-NEXT: fcvtl2 v1.2d, v0.4s
-; CHECK-COMMON-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-COMMON-NEXT: ret
+; CHECK-CVT-SD-LABEL: h_to_d:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.2d, v0.4s
+; CHECK-CVT-SD-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: h_to_d:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-FP16-SD-NEXT: fcvtl2 v1.2d, v0.4s
+; CHECK-FP16-SD-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: h_to_d:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-CVT-GI-NEXT: mov h1, v0.h[1]
+; CHECK-CVT-GI-NEXT: mov h2, v0.h[2]
+; CHECK-CVT-GI-NEXT: mov h3, v0.h[3]
+; CHECK-CVT-GI-NEXT: fcvt d0, h0
+; CHECK-CVT-GI-NEXT: fcvt d4, h1
+; CHECK-CVT-GI-NEXT: fcvt d1, h2
+; CHECK-CVT-GI-NEXT: fcvt d2, h3
+; CHECK-CVT-GI-NEXT: mov v0.d[1], v4.d[0]
+; CHECK-CVT-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: h_to_d:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
+; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
+; CHECK-FP16-GI-NEXT: fcvt d0, h0
+; CHECK-FP16-GI-NEXT: fcvt d4, h1
+; CHECK-FP16-GI-NEXT: fcvt d1, h2
+; CHECK-FP16-GI-NEXT: fcvt d2, h3
+; CHECK-FP16-GI-NEXT: mov v0.d[1], v4.d[0]
+; CHECK-FP16-GI-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-FP16-GI-NEXT: ret
%1 = fpext <4 x half> %a to <4 x double>
ret <4 x double> %1
}
define <4 x half> @bitcast_i_to_h(float, <4 x i16> %a) {
-; CHECK-COMMON-LABEL: bitcast_i_to_h:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: fmov d0, d1
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
%2 = bitcast <4 x i16> %a to <4 x half>
ret <4 x half> %2
}
define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) {
-; CHECK-COMMON-LABEL: bitcast_h_to_i:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: fmov d0, d1
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
%2 = bitcast <4 x half> %a to <4 x i16>
ret <4 x i16> %2
}
define <4 x half> @sitofp_i8(<4 x i8> %a) #0 {
-; CHECK-CVT-LABEL: sitofp_i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-CVT-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: sitofp_i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-CVT-SD-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-CVT-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: sitofp_i8:
; CHECK-FP16: // %bb.0:
@@ -182,6 +304,15 @@ define <4 x half> @sitofp_i8(<4 x i8> %a) #0 {
; CHECK-FP16-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-FP16-NEXT: scvtf v0.4h, v0.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: sitofp_i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: shl v0.4s, v0.4s, #24
+; CHECK-CVT-GI-NEXT: sshr v0.4s, v0.4s, #24
+; CHECK-CVT-GI-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = sitofp <4 x i8> %a to <4 x half>
ret <4 x half> %1
}
@@ -204,43 +335,59 @@ define <4 x half> @sitofp_i16(<4 x i16> %a) #0 {
define <4 x half> @sitofp_i32(<4 x i32> %a) #0 {
-; CHECK-COMMON-LABEL: sitofp_i32:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: scvtf v0.4s, v0.4s
-; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: sitofp_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: scvtf v0.4s, v0.4s
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
%1 = sitofp <4 x i32> %a to <4 x half>
ret <4 x half> %1
}
define <4 x half> @sitofp_i64(<4 x i64> %a) #0 {
-; CHECK-COMMON-LABEL: sitofp_i64:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: scvtf v0.2d, v0.2d
-; CHECK-COMMON-NEXT: scvtf v1.2d, v1.2d
-; CHECK-COMMON-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-COMMON-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: sitofp_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: scvtf v0.2d, v0.2d
+; CHECK-NEXT: scvtf v1.2d, v1.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
%1 = sitofp <4 x i64> %a to <4 x half>
ret <4 x half> %1
}
define <4 x half> @uitofp_i8(<4 x i8> %a) #0 {
-; CHECK-CVT-LABEL: uitofp_i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: uitofp_i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-CVT-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
-; CHECK-FP16-LABEL: uitofp_i8:
-; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-FP16-NEXT: ucvtf v0.4h, v0.4h
-; CHECK-FP16-NEXT: ret
+; CHECK-FP16-SD-LABEL: uitofp_i8:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-FP16-SD-NEXT: ucvtf v0.4h, v0.4h
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: uitofp_i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: movi v1.2d, #0x0000ff000000ff
+; CHECK-CVT-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-GI-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: uitofp_i8:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: movi d1, #0xff00ff00ff00ff
+; CHECK-FP16-GI-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-FP16-GI-NEXT: ucvtf v0.4h, v0.4h
+; CHECK-FP16-GI-NEXT: ret
%1 = uitofp <4 x i8> %a to <4 x half>
ret <4 x half> %1
}
@@ -264,35 +411,35 @@ define <4 x half> @uitofp_i16(<4 x i16> %a) #0 {
define <4 x half> @uitofp_i32(<4 x i32> %a) #0 {
-; CHECK-COMMON-LABEL: uitofp_i32:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: uitofp_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
%1 = uitofp <4 x i32> %a to <4 x half>
ret <4 x half> %1
}
define <4 x half> @uitofp_i64(<4 x i64> %a) #0 {
-; CHECK-COMMON-LABEL: uitofp_i64:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-COMMON-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-COMMON-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-COMMON-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: uitofp_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
%1 = uitofp <4 x i64> %a to <4 x half>
ret <4 x half> %1
}
define void @test_insert_at_zero(half %a, ptr %b) #0 {
-; CHECK-COMMON-LABEL: test_insert_at_zero:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: // kill: def $h0 killed $h0 def $d0
-; CHECK-COMMON-NEXT: str d0, [x0]
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
%1 = insertelement <4 x half> undef, half %a, i64 0
store <4 x half> %1, ptr %b, align 4
ret void
@@ -331,17 +478,29 @@ define <4 x i16> @fptosi_i16(<4 x half> %a) #0 {
}
define <4 x i8> @fptoui_i8(<4 x half> %a) #0 {
-; CHECK-CVT-LABEL: fptoui_i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: fptoui_i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
-; CHECK-FP16-LABEL: fptoui_i8:
-; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h
-; CHECK-FP16-NEXT: ret
+; CHECK-FP16-SD-LABEL: fptoui_i8:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: fcvtzs v0.4h, v0.4h
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: fptoui_i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: fptoui_i8:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: fcvtzu v0.4h, v0.4h
+; CHECK-FP16-GI-NEXT: ret
; NOTE: fcvtzs selected here because the xtn shaves the sign bit
%1 = fptoui <4 x half> %a to <4 x i8>
ret <4 x i8> %1
@@ -364,36 +523,45 @@ define <4 x i16> @fptoui_i16(<4 x half> %a) #0 {
}
define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_une:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmeq v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: mvn v0.16b, v0.16b
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_une:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_une:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmeq v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: mvn v0.8b, v0.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_une:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp une <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ueq:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmgt v2.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v2.16b
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: mvn v0.8b, v0.8b
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ueq:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ueq:
; CHECK-FP16: // %bb.0:
@@ -402,102 +570,149 @@ define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-FP16-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-FP16-NEXT: mvn v0.8b, v0.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ueq:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ueq <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ugt:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcmge v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: mvn v0.8b, v0.8b
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ugt:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ugt:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmge v0.4h, v1.4h, v0.4h
; CHECK-FP16-NEXT: mvn v0.8b, v0.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ugt:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ugt <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_uge:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: mvn v0.8b, v0.8b
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_uge:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_uge:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h
; CHECK-FP16-NEXT: mvn v0.8b, v0.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_uge:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp uge <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ult:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmge v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: mvn v0.8b, v0.8b
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ult:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ult:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmge v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: mvn v0.8b, v0.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ult:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ult <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ule:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmgt v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: mvn v0.8b, v0.8b
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ule:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ule:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmgt v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: mvn v0.8b, v0.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ule:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ule <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_uno:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmge v2.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v2.16b
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: mvn v0.8b, v0.8b
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_uno:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmge v2.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_uno:
; CHECK-FP16: // %bb.0:
@@ -506,21 +721,32 @@ define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-FP16-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-FP16-NEXT: mvn v0.8b, v0.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_uno:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmge v2.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp uno <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_one:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmgt v2.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v2.16b
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_one:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_one:
; CHECK-FP16: // %bb.0:
@@ -528,60 +754,94 @@ define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h
; CHECK-FP16-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_one:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp one <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_oeq:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmeq v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_oeq:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_oeq:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmeq v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_oeq:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp oeq <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ogt:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmgt v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ogt:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ogt:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmgt v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ogt:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ogt <4 x half> %a, %b
ret <4 x i1> %1
}
define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_oge:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmge v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_oge:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_oge:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmge v0.4h, v0.4h, v1.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_oge:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp oge <4 x half> %a, %b
ret <4 x i1> %1
@@ -624,15 +884,15 @@ define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 {
}
define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ord:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmge v2.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v2.16b
-; CHECK-CVT-NEXT: xtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ord:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmge v2.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ord:
; CHECK-FP16: // %bb.0:
@@ -640,6 +900,16 @@ define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h
; CHECK-FP16-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ord:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcmge v2.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ord <4 x half> %a, %b
ret <4 x i1> %1
diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
index fcb42a7..86763eb 100644
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -1,24 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
-; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
+; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-SD
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-SD
+; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-GI
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-GI
define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) {
-; CHECK-CVT-LABEL: add_h:
-; CHECK-CVT: // %bb.0: // %entry
-; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
-; CHECK-CVT-NEXT: fadd v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fadd v1.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: add_h:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: fcvtl v2.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v3.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fadd v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fadd v1.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: add_h:
; CHECK-FP16: // %bb.0: // %entry
; CHECK-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: add_h:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fadd v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fadd v1.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-GI-NEXT: ret
entry:
%0 = fadd <8 x half> %a, %b
ret <8 x half> %0
@@ -26,22 +40,34 @@ entry:
define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) {
-; CHECK-CVT-LABEL: sub_h:
-; CHECK-CVT: // %bb.0: // %entry
-; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
-; CHECK-CVT-NEXT: fsub v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fsub v1.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: sub_h:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: fcvtl v2.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v3.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fsub v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fsub v1.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: sub_h:
; CHECK-FP16: // %bb.0: // %entry
; CHECK-FP16-NEXT: fsub v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: sub_h:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fsub v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fsub v1.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-GI-NEXT: ret
entry:
%0 = fsub <8 x half> %a, %b
ret <8 x half> %0
@@ -49,22 +75,34 @@ entry:
define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) {
-; CHECK-CVT-LABEL: mul_h:
-; CHECK-CVT: // %bb.0: // %entry
-; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
-; CHECK-CVT-NEXT: fmul v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fmul v1.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: mul_h:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: fcvtl v2.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v3.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fmul v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fmul v1.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: mul_h:
; CHECK-FP16: // %bb.0: // %entry
; CHECK-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: mul_h:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fmul v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fmul v1.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-GI-NEXT: ret
entry:
%0 = fmul <8 x half> %a, %b
ret <8 x half> %0
@@ -72,22 +110,34 @@ entry:
define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) {
-; CHECK-CVT-LABEL: div_h:
-; CHECK-CVT: // %bb.0: // %entry
-; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h
-; CHECK-CVT-NEXT: fdiv v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fdiv v1.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: div_h:
+; CHECK-CVT-SD: // %bb.0: // %entry
+; CHECK-CVT-SD-NEXT: fcvtl v2.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v3.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fdiv v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fdiv v1.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: div_h:
; CHECK-FP16: // %bb.0: // %entry
; CHECK-FP16-NEXT: fdiv v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: div_h:
+; CHECK-CVT-GI: // %bb.0: // %entry
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fdiv v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fdiv v1.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-GI-NEXT: ret
entry:
%0 = fdiv <8 x half> %a, %b
ret <8 x half> %0
@@ -126,39 +176,171 @@ define <8 x half> @s_to_h(<8 x float> %a) {
}
define <8 x half> @d_to_h(<8 x double> %a) {
-; CHECK-LABEL: d_to_h:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtxn v0.2s, v0.2d
-; CHECK-NEXT: fcvtxn v2.2s, v2.2d
-; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtxn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-NEXT: ret
+; CHECK-CVT-SD-LABEL: d_to_h:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-CVT-SD-NEXT: fcvtxn v2.2s, v2.2d
+; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-SD-NEXT: fcvtxn2 v2.4s, v3.2d
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: d_to_h:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-FP16-SD-NEXT: fcvtxn v2.2s, v2.2d
+; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-FP16-SD-NEXT: fcvtxn2 v2.4s, v3.2d
+; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-FP16-SD-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: d_to_h:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: mov d4, v0.d[1]
+; CHECK-CVT-GI-NEXT: fcvt h0, d0
+; CHECK-CVT-GI-NEXT: mov d5, v1.d[1]
+; CHECK-CVT-GI-NEXT: fcvt h1, d1
+; CHECK-CVT-GI-NEXT: fcvt h4, d4
+; CHECK-CVT-GI-NEXT: mov v0.h[1], v4.h[0]
+; CHECK-CVT-GI-NEXT: fcvt h4, d5
+; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-CVT-GI-NEXT: mov d1, v2.d[1]
+; CHECK-CVT-GI-NEXT: fcvt h2, d2
+; CHECK-CVT-GI-NEXT: mov v0.h[3], v4.h[0]
+; CHECK-CVT-GI-NEXT: fcvt h1, d1
+; CHECK-CVT-GI-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-CVT-GI-NEXT: mov d2, v3.d[1]
+; CHECK-CVT-GI-NEXT: fcvt h3, d3
+; CHECK-CVT-GI-NEXT: mov v0.h[5], v1.h[0]
+; CHECK-CVT-GI-NEXT: fcvt h1, d2
+; CHECK-CVT-GI-NEXT: mov v0.h[6], v3.h[0]
+; CHECK-CVT-GI-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: d_to_h:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: mov d4, v0.d[1]
+; CHECK-FP16-GI-NEXT: fcvt h0, d0
+; CHECK-FP16-GI-NEXT: mov d5, v1.d[1]
+; CHECK-FP16-GI-NEXT: fcvt h1, d1
+; CHECK-FP16-GI-NEXT: fcvt h4, d4
+; CHECK-FP16-GI-NEXT: mov v0.h[1], v4.h[0]
+; CHECK-FP16-GI-NEXT: fcvt h4, d5
+; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0]
+; CHECK-FP16-GI-NEXT: mov d1, v2.d[1]
+; CHECK-FP16-GI-NEXT: fcvt h2, d2
+; CHECK-FP16-GI-NEXT: mov v0.h[3], v4.h[0]
+; CHECK-FP16-GI-NEXT: fcvt h1, d1
+; CHECK-FP16-GI-NEXT: mov v0.h[4], v2.h[0]
+; CHECK-FP16-GI-NEXT: mov d2, v3.d[1]
+; CHECK-FP16-GI-NEXT: fcvt h3, d3
+; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[0]
+; CHECK-FP16-GI-NEXT: fcvt h1, d2
+; CHECK-FP16-GI-NEXT: mov v0.h[6], v3.h[0]
+; CHECK-FP16-GI-NEXT: mov v0.h[7], v1.h[0]
+; CHECK-FP16-GI-NEXT: ret
%1 = fptrunc <8 x double> %a to <8 x half>
ret <8 x half> %1
}
define <8 x float> @h_to_s(<8 x half> %a) {
-; CHECK-LABEL: h_to_s:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-NEXT: ret
+; CHECK-CVT-SD-LABEL: h_to_s:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: h_to_s:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-FP16-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: h_to_s:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: h_to_s:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-FP16-GI-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-FP16-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-FP16-GI-NEXT: ret
%1 = fpext <8 x half> %a to <8 x float>
ret <8 x float> %1
}
define <8 x double> @h_to_d(<8 x half> %a) {
-; CHECK-LABEL: h_to_d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-NEXT: fcvtl2 v2.4s, v0.8h
-; CHECK-NEXT: fcvtl v0.2d, v1.2s
-; CHECK-NEXT: fcvtl2 v3.2d, v2.4s
-; CHECK-NEXT: fcvtl2 v1.2d, v1.4s
-; CHECK-NEXT: fcvtl v2.2d, v2.2s
-; CHECK-NEXT: ret
+; CHECK-CVT-SD-LABEL: h_to_d:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.2d, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtl v2.2d, v2.2s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: h_to_d:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-FP16-SD-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-FP16-SD-NEXT: fcvtl v0.2d, v1.2s
+; CHECK-FP16-SD-NEXT: fcvtl2 v3.2d, v2.4s
+; CHECK-FP16-SD-NEXT: fcvtl2 v1.2d, v1.4s
+; CHECK-FP16-SD-NEXT: fcvtl v2.2d, v2.2s
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: h_to_d:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: mov h1, v0.h[1]
+; CHECK-CVT-GI-NEXT: mov h2, v0.h[2]
+; CHECK-CVT-GI-NEXT: mov h3, v0.h[3]
+; CHECK-CVT-GI-NEXT: mov h4, v0.h[4]
+; CHECK-CVT-GI-NEXT: mov h5, v0.h[5]
+; CHECK-CVT-GI-NEXT: mov h6, v0.h[6]
+; CHECK-CVT-GI-NEXT: mov h7, v0.h[7]
+; CHECK-CVT-GI-NEXT: fcvt d0, h0
+; CHECK-CVT-GI-NEXT: fcvt d16, h1
+; CHECK-CVT-GI-NEXT: fcvt d1, h2
+; CHECK-CVT-GI-NEXT: fcvt d17, h3
+; CHECK-CVT-GI-NEXT: fcvt d2, h4
+; CHECK-CVT-GI-NEXT: fcvt d4, h5
+; CHECK-CVT-GI-NEXT: fcvt d3, h6
+; CHECK-CVT-GI-NEXT: fcvt d5, h7
+; CHECK-CVT-GI-NEXT: mov v0.d[1], v16.d[0]
+; CHECK-CVT-GI-NEXT: mov v1.d[1], v17.d[0]
+; CHECK-CVT-GI-NEXT: mov v2.d[1], v4.d[0]
+; CHECK-CVT-GI-NEXT: mov v3.d[1], v5.d[0]
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: h_to_d:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: mov h1, v0.h[1]
+; CHECK-FP16-GI-NEXT: mov h2, v0.h[2]
+; CHECK-FP16-GI-NEXT: mov h3, v0.h[3]
+; CHECK-FP16-GI-NEXT: mov h4, v0.h[4]
+; CHECK-FP16-GI-NEXT: mov h5, v0.h[5]
+; CHECK-FP16-GI-NEXT: mov h6, v0.h[6]
+; CHECK-FP16-GI-NEXT: mov h7, v0.h[7]
+; CHECK-FP16-GI-NEXT: fcvt d0, h0
+; CHECK-FP16-GI-NEXT: fcvt d16, h1
+; CHECK-FP16-GI-NEXT: fcvt d1, h2
+; CHECK-FP16-GI-NEXT: fcvt d17, h3
+; CHECK-FP16-GI-NEXT: fcvt d2, h4
+; CHECK-FP16-GI-NEXT: fcvt d4, h5
+; CHECK-FP16-GI-NEXT: fcvt d3, h6
+; CHECK-FP16-GI-NEXT: fcvt d5, h7
+; CHECK-FP16-GI-NEXT: mov v0.d[1], v16.d[0]
+; CHECK-FP16-GI-NEXT: mov v1.d[1], v17.d[0]
+; CHECK-FP16-GI-NEXT: mov v2.d[1], v4.d[0]
+; CHECK-FP16-GI-NEXT: mov v3.d[1], v5.d[0]
+; CHECK-FP16-GI-NEXT: ret
%1 = fpext <8 x half> %a to <8 x double>
ret <8 x double> %1
}
@@ -183,14 +365,14 @@ define <8 x i16> @bitcast_h_to_i(float, <8 x half> %a) {
}
define <4 x half> @sitofp_v4i8(<4 x i8> %a) #0 {
-; CHECK-CVT-LABEL: sitofp_v4i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-CVT-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: sitofp_v4i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-CVT-SD-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-CVT-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: sitofp_v4i8:
; CHECK-FP16: // %bb.0:
@@ -198,76 +380,132 @@ define <4 x half> @sitofp_v4i8(<4 x i8> %a) #0 {
; CHECK-FP16-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-FP16-NEXT: scvtf v0.4h, v0.4h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: sitofp_v4i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: shl v0.4s, v0.4s, #24
+; CHECK-CVT-GI-NEXT: sshr v0.4s, v0.4s, #24
+; CHECK-CVT-GI-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = sitofp <4 x i8> %a to <4 x half>
ret <4 x half> %1
}
define <8 x half> @sitofp_v8i8(<8 x i8> %a) #0 {
-; CHECK-CVT-LABEL: sitofp_v8i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-CVT-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-CVT-NEXT: sshll2 v2.4s, v0.8h, #0
-; CHECK-CVT-NEXT: scvtf v1.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-CVT-NEXT: scvtf v1.4s, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: sitofp_v8i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-CVT-SD-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: sshll2 v2.4s, v0.8h, #0
+; CHECK-CVT-SD-NEXT: scvtf v1.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-CVT-SD-NEXT: scvtf v1.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: sitofp_v8i8:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-FP16-NEXT: scvtf v0.8h, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: sitofp_v8i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-CVT-GI-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-GI-NEXT: scvtf v1.4s, v1.4s
+; CHECK-CVT-GI-NEXT: scvtf v2.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = sitofp <8 x i8> %a to <8 x half>
ret <8 x half> %1
}
define <16 x half> @sitofp_v16i8(<16 x i8> %a) #0 {
-; CHECK-CVT-LABEL: sitofp_v16i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: sshll2 v1.8h, v0.16b, #0
-; CHECK-CVT-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-CVT-NEXT: sshll v2.4s, v1.4h, #0
-; CHECK-CVT-NEXT: sshll v3.4s, v0.4h, #0
-; CHECK-CVT-NEXT: sshll2 v4.4s, v1.8h, #0
-; CHECK-CVT-NEXT: sshll2 v5.4s, v0.8h, #0
-; CHECK-CVT-NEXT: scvtf v2.4s, v2.4s
-; CHECK-CVT-NEXT: scvtf v3.4s, v3.4s
-; CHECK-CVT-NEXT: fcvtn v1.4h, v2.4s
-; CHECK-CVT-NEXT: scvtf v2.4s, v4.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v3.4s
-; CHECK-CVT-NEXT: scvtf v3.4s, v5.4s
-; CHECK-CVT-NEXT: fcvtn2 v1.8h, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v3.4s
-; CHECK-CVT-NEXT: ret
-;
-; CHECK-FP16-LABEL: sitofp_v16i8:
-; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: sshll2 v1.8h, v0.16b, #0
-; CHECK-FP16-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-FP16-NEXT: scvtf v1.8h, v1.8h
-; CHECK-FP16-NEXT: scvtf v0.8h, v0.8h
-; CHECK-FP16-NEXT: ret
+; CHECK-CVT-SD-LABEL: sitofp_v16i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-CVT-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-CVT-SD-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-CVT-SD-NEXT: sshll v3.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: sshll2 v4.4s, v1.8h, #0
+; CHECK-CVT-SD-NEXT: sshll2 v5.4s, v0.8h, #0
+; CHECK-CVT-SD-NEXT: scvtf v2.4s, v2.4s
+; CHECK-CVT-SD-NEXT: scvtf v3.4s, v3.4s
+; CHECK-CVT-SD-NEXT: fcvtn v1.4h, v2.4s
+; CHECK-CVT-SD-NEXT: scvtf v2.4s, v4.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v3.4s
+; CHECK-CVT-SD-NEXT: scvtf v3.4s, v5.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v1.8h, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v3.4s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: sitofp_v16i8:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-FP16-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-FP16-SD-NEXT: scvtf v1.8h, v1.8h
+; CHECK-FP16-SD-NEXT: scvtf v0.8h, v0.8h
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: sitofp_v16i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: sshll v1.8h, v0.8b, #0
+; CHECK-CVT-GI-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-CVT-GI-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-CVT-GI-NEXT: sshll v3.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: sshll2 v1.4s, v1.8h, #0
+; CHECK-CVT-GI-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-GI-NEXT: scvtf v2.4s, v2.4s
+; CHECK-CVT-GI-NEXT: scvtf v3.4s, v3.4s
+; CHECK-CVT-GI-NEXT: scvtf v4.4s, v1.4s
+; CHECK-CVT-GI-NEXT: scvtf v5.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-GI-NEXT: fcvtn v1.4h, v3.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v4.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v1.8h, v5.4s
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: sitofp_v16i8:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: sshll v1.8h, v0.8b, #0
+; CHECK-FP16-GI-NEXT: sshll2 v2.8h, v0.16b, #0
+; CHECK-FP16-GI-NEXT: scvtf v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT: scvtf v1.8h, v2.8h
+; CHECK-FP16-GI-NEXT: ret
%1 = sitofp <16 x i8> %a to <16 x half>
ret <16 x half> %1
}
define <8 x half> @sitofp_i16(<8 x i16> %a) #0 {
-; CHECK-CVT-LABEL: sitofp_i16:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: sshll v1.4s, v0.4h, #0
-; CHECK-CVT-NEXT: sshll2 v2.4s, v0.8h, #0
-; CHECK-CVT-NEXT: scvtf v1.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-CVT-NEXT: scvtf v1.4s, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: sitofp_i16:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: sshll2 v2.4s, v0.8h, #0
+; CHECK-CVT-SD-NEXT: scvtf v1.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-CVT-SD-NEXT: scvtf v1.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: sitofp_i16:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: scvtf v0.8h, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: sitofp_i16:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: sshll v1.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-GI-NEXT: scvtf v1.4s, v1.4s
+; CHECK-CVT-GI-NEXT: scvtf v2.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = sitofp <8 x i16> %a to <8 x half>
ret <8 x half> %1
}
@@ -286,108 +524,213 @@ define <8 x half> @sitofp_i32(<8 x i32> %a) #0 {
define <8 x half> @sitofp_i64(<8 x i64> %a) #0 {
-; CHECK-LABEL: sitofp_i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: scvtf v0.2d, v0.2d
-; CHECK-NEXT: scvtf v2.2d, v2.2d
-; CHECK-NEXT: scvtf v1.2d, v1.2d
-; CHECK-NEXT: scvtf v3.2d, v3.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-NEXT: ret
+; CHECK-CVT-SD-LABEL: sitofp_i64:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: scvtf v0.2d, v0.2d
+; CHECK-CVT-SD-NEXT: scvtf v2.2d, v2.2d
+; CHECK-CVT-SD-NEXT: scvtf v1.2d, v1.2d
+; CHECK-CVT-SD-NEXT: scvtf v3.2d, v3.2d
+; CHECK-CVT-SD-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-SD-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-SD-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: sitofp_i64:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: scvtf v0.2d, v0.2d
+; CHECK-FP16-SD-NEXT: scvtf v2.2d, v2.2d
+; CHECK-FP16-SD-NEXT: scvtf v1.2d, v1.2d
+; CHECK-FP16-SD-NEXT: scvtf v3.2d, v3.2d
+; CHECK-FP16-SD-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-FP16-SD-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-FP16-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-FP16-SD-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-FP16-SD-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: sitofp_i64:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: scvtf v0.2d, v0.2d
+; CHECK-CVT-GI-NEXT: scvtf v1.2d, v1.2d
+; CHECK-CVT-GI-NEXT: scvtf v2.2d, v2.2d
+; CHECK-CVT-GI-NEXT: scvtf v3.2d, v3.2d
+; CHECK-CVT-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-GI-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-GI-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: sitofp_i64:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: scvtf v0.2d, v0.2d
+; CHECK-FP16-GI-NEXT: scvtf v1.2d, v1.2d
+; CHECK-FP16-GI-NEXT: scvtf v2.2d, v2.2d
+; CHECK-FP16-GI-NEXT: scvtf v3.2d, v3.2d
+; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-FP16-GI-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-FP16-GI-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-FP16-GI-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-FP16-GI-NEXT: ret
%1 = sitofp <8 x i64> %a to <8 x half>
ret <8 x half> %1
}
define <4 x half> @uitofp_v4i8(<4 x i8> %a) #0 {
-; CHECK-CVT-LABEL: uitofp_v4i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: ret
-;
-; CHECK-FP16-LABEL: uitofp_v4i8:
-; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-FP16-NEXT: ucvtf v0.4h, v0.4h
-; CHECK-FP16-NEXT: ret
+; CHECK-CVT-SD-LABEL: uitofp_v4i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-CVT-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: uitofp_v4i8:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-FP16-SD-NEXT: ucvtf v0.4h, v0.4h
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: uitofp_v4i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: movi v1.2d, #0x0000ff000000ff
+; CHECK-CVT-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-GI-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: uitofp_v4i8:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: movi d1, #0xff00ff00ff00ff
+; CHECK-FP16-GI-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-FP16-GI-NEXT: ucvtf v0.4h, v0.4h
+; CHECK-FP16-GI-NEXT: ret
%1 = uitofp <4 x i8> %a to <4 x half>
ret <4 x half> %1
}
define <8 x half> @uitofp_v8i8(<8 x i8> %a) #0 {
-; CHECK-CVT-LABEL: uitofp_v8i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-CVT-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-CVT-NEXT: ushll2 v2.4s, v0.8h, #0
-; CHECK-CVT-NEXT: ucvtf v1.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-CVT-NEXT: ucvtf v1.4s, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: uitofp_v8i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-CVT-SD-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-CVT-SD-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-CVT-SD-NEXT: ucvtf v1.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: uitofp_v8i8:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-FP16-NEXT: ucvtf v0.8h, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: uitofp_v8i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-CVT-GI-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-GI-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-CVT-GI-NEXT: ucvtf v2.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = uitofp <8 x i8> %a to <8 x half>
ret <8 x half> %1
}
define <16 x half> @uitofp_v16i8(<16 x i8> %a) #0 {
-; CHECK-CVT-LABEL: uitofp_v16i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: ushll2 v1.8h, v0.16b, #0
-; CHECK-CVT-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-CVT-NEXT: ushll v2.4s, v1.4h, #0
-; CHECK-CVT-NEXT: ushll v3.4s, v0.4h, #0
-; CHECK-CVT-NEXT: ushll2 v4.4s, v1.8h, #0
-; CHECK-CVT-NEXT: ushll2 v5.4s, v0.8h, #0
-; CHECK-CVT-NEXT: ucvtf v2.4s, v2.4s
-; CHECK-CVT-NEXT: ucvtf v3.4s, v3.4s
-; CHECK-CVT-NEXT: fcvtn v1.4h, v2.4s
-; CHECK-CVT-NEXT: ucvtf v2.4s, v4.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v3.4s
-; CHECK-CVT-NEXT: ucvtf v3.4s, v5.4s
-; CHECK-CVT-NEXT: fcvtn2 v1.8h, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v3.4s
-; CHECK-CVT-NEXT: ret
-;
-; CHECK-FP16-LABEL: uitofp_v16i8:
-; CHECK-FP16: // %bb.0:
-; CHECK-FP16-NEXT: ushll2 v1.8h, v0.16b, #0
-; CHECK-FP16-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-FP16-NEXT: ucvtf v1.8h, v1.8h
-; CHECK-FP16-NEXT: ucvtf v0.8h, v0.8h
-; CHECK-FP16-NEXT: ret
+; CHECK-CVT-SD-LABEL: uitofp_v16i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-CVT-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-CVT-SD-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-CVT-SD-NEXT: ushll v3.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: ushll2 v4.4s, v1.8h, #0
+; CHECK-CVT-SD-NEXT: ushll2 v5.4s, v0.8h, #0
+; CHECK-CVT-SD-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-CVT-SD-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-CVT-SD-NEXT: fcvtn v1.4h, v2.4s
+; CHECK-CVT-SD-NEXT: ucvtf v2.4s, v4.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v3.4s
+; CHECK-CVT-SD-NEXT: ucvtf v3.4s, v5.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v1.8h, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v3.4s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: uitofp_v16i8:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-FP16-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-FP16-SD-NEXT: ucvtf v1.8h, v1.8h
+; CHECK-FP16-SD-NEXT: ucvtf v0.8h, v0.8h
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: uitofp_v16i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-CVT-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-CVT-GI-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-CVT-GI-NEXT: ushll v3.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: ushll2 v1.4s, v1.8h, #0
+; CHECK-CVT-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-GI-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-CVT-GI-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-CVT-GI-NEXT: ucvtf v4.4s, v1.4s
+; CHECK-CVT-GI-NEXT: ucvtf v5.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s
+; CHECK-CVT-GI-NEXT: fcvtn v1.4h, v3.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v4.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v1.8h, v5.4s
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: uitofp_v16i8:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: ushll v1.8h, v0.8b, #0
+; CHECK-FP16-GI-NEXT: ushll2 v2.8h, v0.16b, #0
+; CHECK-FP16-GI-NEXT: ucvtf v0.8h, v1.8h
+; CHECK-FP16-GI-NEXT: ucvtf v1.8h, v2.8h
+; CHECK-FP16-GI-NEXT: ret
%1 = uitofp <16 x i8> %a to <16 x half>
ret <16 x half> %1
}
define <8 x half> @uitofp_i16(<8 x i16> %a) #0 {
-; CHECK-CVT-LABEL: uitofp_i16:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: ushll v1.4s, v0.4h, #0
-; CHECK-CVT-NEXT: ushll2 v2.4s, v0.8h, #0
-; CHECK-CVT-NEXT: ucvtf v1.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtn v0.4h, v1.4s
-; CHECK-CVT-NEXT: ucvtf v1.4s, v2.4s
-; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: uitofp_i16:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-CVT-SD-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-CVT-SD-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-CVT-SD-NEXT: ucvtf v1.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: uitofp_i16:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: ucvtf v0.8h, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: uitofp_i16:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-CVT-GI-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-GI-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-CVT-GI-NEXT: ucvtf v2.4s, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-GI-NEXT: ret
%1 = uitofp <8 x i16> %a to <8 x half>
ret <8 x half> %1
}
@@ -407,19 +750,61 @@ define <8 x half> @uitofp_i32(<8 x i32> %a) #0 {
define <8 x half> @uitofp_i64(<8 x i64> %a) #0 {
-; CHECK-LABEL: uitofp_i64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ucvtf v0.2d, v0.2d
-; CHECK-NEXT: ucvtf v2.2d, v2.2d
-; CHECK-NEXT: ucvtf v1.2d, v1.2d
-; CHECK-NEXT: ucvtf v3.2d, v3.2d
-; CHECK-NEXT: fcvtn v0.2s, v0.2d
-; CHECK-NEXT: fcvtn v2.2s, v2.2d
-; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
-; CHECK-NEXT: fcvtn2 v2.4s, v3.2d
-; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn2 v0.8h, v2.4s
-; CHECK-NEXT: ret
+; CHECK-CVT-SD-LABEL: uitofp_i64:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-CVT-SD-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-CVT-SD-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-CVT-SD-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-CVT-SD-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-SD-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-SD-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-SD-NEXT: ret
+;
+; CHECK-FP16-SD-LABEL: uitofp_i64:
+; CHECK-FP16-SD: // %bb.0:
+; CHECK-FP16-SD-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-FP16-SD-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-FP16-SD-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-FP16-SD-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-FP16-SD-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-FP16-SD-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-FP16-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-FP16-SD-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-FP16-SD-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-FP16-SD-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: uitofp_i64:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-CVT-GI-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-CVT-GI-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-CVT-GI-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-CVT-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-GI-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-GI-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-CVT-GI-NEXT: ret
+;
+; CHECK-FP16-GI-LABEL: uitofp_i64:
+; CHECK-FP16-GI: // %bb.0:
+; CHECK-FP16-GI-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-FP16-GI-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-FP16-GI-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-FP16-GI-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-FP16-GI-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-FP16-GI-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-FP16-GI-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s
+; CHECK-FP16-GI-NEXT: fcvtn2 v0.8h, v2.4s
+; CHECK-FP16-GI-NEXT: ret
%1 = uitofp <8 x i64> %a to <8 x half>
ret <8 x half> %1
}
@@ -436,94 +821,132 @@ define void @test_insert_at_zero(half %a, ptr %b) #0 {
}
define <8 x i8> @fptosi_i8(<8 x half> %a) #0 {
-; CHECK-CVT-LABEL: fptosi_i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: fptosi_i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: fptosi_i8:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: fptosi_i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fptosi <8 x half> %a to <8 x i8>
ret <8 x i8> %1
}
define <8 x i16> @fptosi_i16(<8 x half> %a) #0 {
-; CHECK-CVT-LABEL: fptosi_i16:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: fptosi_i16:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: fptosi_i16:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: fptosi_i16:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fptosi <8 x half> %a to <8 x i16>
ret <8 x i16> %1
}
define <8 x i8> @fptoui_i8(<8 x half> %a) #0 {
-; CHECK-CVT-LABEL: fptoui_i8:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: fptoui_i8:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: fptoui_i8:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: fptoui_i8:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fptoui <8 x half> %a to <8 x i8>
ret <8 x i8> %1
}
define <8 x i16> @fptoui_i16(<8 x half> %a) #0 {
-; CHECK-CVT-LABEL: fptoui_i16:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s
-; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: fptoui_i16:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: fptoui_i16:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: fptoui_i16:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fptoui <8 x half> %a to <8 x i16>
ret <8 x i16> %1
}
define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_une:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmeq v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmeq v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: mvn v0.16b, v0.16b
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_une:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_une:
; CHECK-FP16: // %bb.0:
@@ -531,27 +954,41 @@ define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: mvn v0.16b, v0.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_une:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmeq v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp une <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ueq:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmgt v4.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmgt v2.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT: fcmgt v3.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: orr v1.16b, v2.16b, v4.16b
-; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v3.16b
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-CVT-NEXT: mvn v0.16b, v0.16b
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ueq:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ueq:
; CHECK-FP16: // %bb.0:
@@ -561,23 +998,41 @@ define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: mvn v0.16b, v0.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ueq:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmgt v4.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-GI-NEXT: fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-GI-NEXT: mvn v1.16b, v1.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ueq <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ugt:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcmge v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmge v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: mvn v0.16b, v0.16b
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ugt:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ugt:
; CHECK-FP16: // %bb.0:
@@ -585,23 +1040,37 @@ define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: mvn v0.16b, v0.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ugt:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ugt <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_uge:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: mvn v0.16b, v0.16b
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_uge:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_uge:
; CHECK-FP16: // %bb.0:
@@ -609,23 +1078,37 @@ define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: mvn v0.16b, v0.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_uge:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp uge <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ult:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmge v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmge v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: mvn v0.16b, v0.16b
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ult:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ult:
; CHECK-FP16: // %bb.0:
@@ -633,23 +1116,37 @@ define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: mvn v0.16b, v0.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ult:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmge v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ult <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ule:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: mvn v0.16b, v0.16b
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ule:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ule:
; CHECK-FP16: // %bb.0:
@@ -657,27 +1154,41 @@ define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: mvn v0.16b, v0.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ule:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ule <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_uno:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmge v4.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmgt v2.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT: fcmge v3.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: orr v1.16b, v2.16b, v4.16b
-; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v3.16b
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-CVT-NEXT: mvn v0.16b, v0.16b
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_uno:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_uno:
; CHECK-FP16: // %bb.0:
@@ -687,26 +1198,44 @@ define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: mvn v0.16b, v0.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_uno:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmge v4.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-GI-NEXT: fcmge v3.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-GI-NEXT: mvn v1.16b, v1.16b
+; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp uno <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_one:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmgt v4.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmgt v2.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT: fcmgt v3.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: orr v1.16b, v2.16b, v4.16b
-; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v3.16b
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_one:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_one:
; CHECK-FP16: // %bb.0:
@@ -715,136 +1244,212 @@ define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: orr v0.16b, v0.16b, v2.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_one:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmgt v4.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-GI-NEXT: fcmgt v3.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp one <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_oeq:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmeq v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmeq v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_oeq:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_oeq:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmeq v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_oeq:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmeq v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp oeq <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ogt:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ogt:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ogt:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmgt v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ogt:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ogt <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_oge:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmge v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmge v0.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_oge:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_oge:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmge v0.8h, v0.8h, v1.8h
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_oge:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmge v2.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp oge <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_olt:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcmgt v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_olt:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_olt:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_olt:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp olt <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ole:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcmge v2.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmge v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ole:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ole:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcmge v0.8h, v1.8h, v0.8h
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ole:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmge v2.4s, v3.4s, v2.4s
+; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ole <8 x half> %a, %b
ret <8 x i1> %1
}
define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 {
-; CHECK-CVT-LABEL: test_fcmp_ord:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h
-; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h
-; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h
-; CHECK-CVT-NEXT: fcmge v4.4s, v3.4s, v2.4s
-; CHECK-CVT-NEXT: fcmgt v2.4s, v2.4s, v3.4s
-; CHECK-CVT-NEXT: fcmge v3.4s, v0.4s, v1.4s
-; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-CVT-NEXT: orr v1.16b, v2.16b, v4.16b
-; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v3.16b
-; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-CVT-NEXT: xtn v0.8b, v0.8h
-; CHECK-CVT-NEXT: ret
+; CHECK-CVT-SD-LABEL: test_fcmp_ord:
+; CHECK-CVT-SD: // %bb.0:
+; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h
+; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h
+; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-CVT-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s
+; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s
+; CHECK-CVT-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s
+; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fcmp_ord:
; CHECK-FP16: // %bb.0:
@@ -853,8 +1458,27 @@ define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-FP16-NEXT: orr v0.16b, v0.16b, v2.16b
; CHECK-FP16-NEXT: xtn v0.8b, v0.8h
; CHECK-FP16-NEXT: ret
+;
+; CHECK-CVT-GI-LABEL: test_fcmp_ord:
+; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-CVT-GI-NEXT: fcmge v4.4s, v2.4s, v3.4s
+; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s
+; CHECK-CVT-GI-NEXT: fcmge v3.4s, v0.4s, v1.4s
+; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b
+; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h
+; CHECK-CVT-GI-NEXT: ret
%1 = fcmp ord <8 x half> %a, %b
ret <8 x i1> %1
}
attributes #0 = { nounwind }
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-CVT: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 18665bc..7195e2b 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -2093,3 +2093,54 @@ define <2 x i1> @icmp_slt_v2i64_Zero_LHS(<2 x i64> %a) {
%c = icmp slt <2 x i64> <i64 0, i64 0>, %a
ret <2 x i1> %c
}
+
+; Test the TST optimization for i8 sign-bit tests feeding a cross-type select.
+; This tests the pattern: icmp slt i8 %val, 0; select i1 %cmp, i32 %a, i32 %b.
+; The optimization should convert the sxtb+cmp sequence to a single tst when
+; only the sign bit is tested. (A standalone sketch of the matched pattern
+; follows these two tests.)
+
+define i32 @i8_signbit_tst_constants(i8 %x, i8 %y) {
+; CHECK-SD-LABEL: i8_signbit_tst_constants:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add w9, w0, w1
+; CHECK-SD-NEXT: mov w8, #42 // =0x2a
+; CHECK-SD-NEXT: tst w9, #0x80
+; CHECK-SD-NEXT: mov w9, #20894 // =0x519e
+; CHECK-SD-NEXT: csel w0, w9, w8, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i8_signbit_tst_constants:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add w8, w0, w1
+; CHECK-GI-NEXT: mov w9, #42 // =0x2a
+; CHECK-GI-NEXT: mov w10, #20894 // =0x519e
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: cmp w8, #0
+; CHECK-GI-NEXT: csel w0, w10, w9, mi
+; CHECK-GI-NEXT: ret
+ %add = add i8 %x, %y
+ %cmp = icmp slt i8 %add, 0
+ %sel = select i1 %cmp, i32 20894, i32 42
+ ret i32 %sel
+}
+
+; Test the same i8 sign-bit pattern with variable select values (the problematic case).
+define i32 @i8_signbit_variables(i8 %x, i8 %y, i32 %a, i32 %b) {
+; CHECK-SD-LABEL: i8_signbit_variables:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: add w8, w0, w1
+; CHECK-SD-NEXT: tst w8, #0x80
+; CHECK-SD-NEXT: csel w0, w2, w3, ne
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: i8_signbit_variables:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: add w8, w0, w1
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: cmp w8, #0
+; CHECK-GI-NEXT: csel w0, w2, w3, mi
+; CHECK-GI-NEXT: ret
+ %add = add i8 %x, %y
+ %cmp = icmp slt i8 %add, 0
+ %sel = select i1 %cmp, i32 %a, i32 %b
+ ret i32 %sel
+}
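+
+; A minimal standalone IR sketch (hypothetical @i8_signbit_sketch; not one of
+; the checked tests) of the masked form the TST optimization is expected to
+; match: an "and" with 0x80 compared against zero is equivalent to
+; icmp slt i8 %val, 0 and maps onto a single tst instruction.
+define i32 @i8_signbit_sketch(i8 %v, i32 %a, i32 %b) {
+  %m = and i8 %v, -128                   ; isolate the sign bit (0x80)
+  %cmp = icmp ne i8 %m, 0                ; true iff %v is negative
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}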
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index fc43c71..b6dee97e 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-SDAG
; A simple EH test case that corresponds to the following C++ source:
;
@@ -87,6 +88,90 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe
; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl _Unwind_Resume
+;
+; CHECK-SDAG-LABEL: za_with_raii:
+; CHECK-SDAG: .Lfunc_begin0:
+; CHECK-SDAG-NEXT: .cfi_startproc
+; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception0
+; CHECK-SDAG-NEXT: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32
+; CHECK-SDAG-NEXT: .cfi_offset w19, -8
+; CHECK-SDAG-NEXT: .cfi_offset w20, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -24
+; CHECK-SDAG-NEXT: .cfi_offset w29, -32
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: tbnz w0, #0, .LBB0_2
+; CHECK-SDAG-NEXT: // %bb.1: // %return_normally
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: b shared_za_call
+; CHECK-SDAG-NEXT: .LBB0_2: // %throw_exception
+; CHECK-SDAG-NEXT: sub x20, x29, #16
+; CHECK-SDAG-NEXT: mov w0, #8 // =0x8
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl __cxa_allocate_exception
+; CHECK-SDAG-NEXT: mov x8, x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x9, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x9, .LBB0_4
+; CHECK-SDAG-NEXT: // %bb.3: // %throw_exception
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB0_4: // %throw_exception
+; CHECK-SDAG-NEXT: adrp x9, .L.str
+; CHECK-SDAG-NEXT: add x9, x9, :lo12:.L.str
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: str x9, [x8]
+; CHECK-SDAG-NEXT: .Ltmp0: // EH_LABEL
+; CHECK-SDAG-NEXT: adrp x1, :got:typeinfo_for_char_const_ptr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: mov x0, x8
+; CHECK-SDAG-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr]
+; CHECK-SDAG-NEXT: mov x2, xzr
+; CHECK-SDAG-NEXT: bl __cxa_throw
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB0_6
+; CHECK-SDAG-NEXT: // %bb.5: // %throw_exception
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB0_6: // %throw_exception
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .Ltmp1: // EH_LABEL
+; CHECK-SDAG-NEXT: // %bb.7: // %throw_fail
+; CHECK-SDAG-NEXT: .LBB0_8: // %unwind_dtors
+; CHECK-SDAG-NEXT: .Ltmp2: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x19, x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB0_10
+; CHECK-SDAG-NEXT: // %bb.9: // %unwind_dtors
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB0_10: // %unwind_dtors
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20
+; CHECK-SDAG-NEXT: bl _Unwind_Resume
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB0_12
+; CHECK-SDAG-NEXT: // %bb.11: // %unwind_dtors
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB0_12: // %unwind_dtors
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
br i1 %fail, label %throw_exception, label %return_normally
throw_exception:
@@ -124,7 +209,7 @@ throw_fail:
; }
; shared_za_call();
; }
-define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 {
+define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 {
; CHECK-LABEL: try_catch:
; CHECK: .Lfunc_begin1:
; CHECK-NEXT: .cfi_startproc
@@ -142,11 +227,11 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEXT: .Ltmp3:
+; CHECK-NEXT: .Ltmp3: // EH_LABEL
; CHECK-NEXT: sub x8, x29, #16
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl may_throw
-; CHECK-NEXT: .Ltmp4:
+; CHECK-NEXT: .Ltmp4: // EH_LABEL
; CHECK-NEXT: .LBB1_1: // %after_catch
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -160,7 +245,7 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: b shared_za_call
; CHECK-NEXT: .LBB1_4: // %catch
-; CHECK-NEXT: .Ltmp5:
+; CHECK-NEXT: .Ltmp5: // EH_LABEL
; CHECK-NEXT: bl __cxa_begin_catch
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -175,6 +260,78 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl __cxa_end_catch
; CHECK-NEXT: b .LBB1_1
+;
+; CHECK-SDAG-LABEL: try_catch:
+; CHECK-SDAG: .Lfunc_begin1:
+; CHECK-SDAG-NEXT: .cfi_startproc
+; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception1
+; CHECK-SDAG-NEXT: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32
+; CHECK-SDAG-NEXT: .cfi_offset w19, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -24
+; CHECK-SDAG-NEXT: .cfi_offset w29, -32
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: .Ltmp3: // EH_LABEL
+; CHECK-SDAG-NEXT: sub x19, x29, #16
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19
+; CHECK-SDAG-NEXT: bl may_throw
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2
+; CHECK-SDAG-NEXT: // %bb.1:
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB1_2:
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .Ltmp4: // EH_LABEL
+; CHECK-SDAG-NEXT: .LBB1_3: // %after_catch
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: b shared_za_call
+; CHECK-SDAG-NEXT: .LBB1_4: // %catch
+; CHECK-SDAG-NEXT: .Ltmp5: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB1_6
+; CHECK-SDAG-NEXT: // %bb.5: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB1_6: // %catch
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19
+; CHECK-SDAG-NEXT: bl __cxa_begin_catch
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB1_8
+; CHECK-SDAG-NEXT: // %bb.7: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB1_8: // %catch
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19
+; CHECK-SDAG-NEXT: bl __cxa_end_catch
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB1_10
+; CHECK-SDAG-NEXT: // %bb.9: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB1_10: // %catch
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: b .LBB1_3
invoke void @may_throw()
to label %after_catch unwind label %catch
@@ -235,16 +392,16 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx
; CHECK-NEXT: zero {za}
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: smstart za
-; CHECK-NEXT: .Ltmp6:
+; CHECK-NEXT: .Ltmp6: // EH_LABEL
; CHECK-NEXT: bl shared_za_call
-; CHECK-NEXT: .Ltmp7:
+; CHECK-NEXT: .Ltmp7: // EH_LABEL
; CHECK-NEXT: .LBB2_3: // %exit
; CHECK-NEXT: smstop za
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_4: // %catch
-; CHECK-NEXT: .Ltmp8:
+; CHECK-NEXT: .Ltmp8: // EH_LABEL
; CHECK-NEXT: bl __cxa_begin_catch
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -260,6 +417,78 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx
; CHECK-NEXT: bl __cxa_end_catch
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: b .LBB2_3
+;
+; CHECK-SDAG-LABEL: try_catch_shared_za_callee:
+; CHECK-SDAG: .Lfunc_begin2:
+; CHECK-SDAG-NEXT: .cfi_startproc
+; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception2
+; CHECK-SDAG-NEXT: // %bb.0: // %prelude
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: sub sp, sp, #16
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32
+; CHECK-SDAG-NEXT: .cfi_offset w19, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -24
+; CHECK-SDAG-NEXT: .cfi_offset w29, -32
+; CHECK-SDAG-NEXT: rdsvl x8, #1
+; CHECK-SDAG-NEXT: mov x9, sp
+; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
+; CHECK-SDAG-NEXT: mov sp, x9
+; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: cbz x8, .LBB2_2
+; CHECK-SDAG-NEXT: // %bb.1: // %save.za
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_save
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: .LBB2_2:
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: zero {za}
+; CHECK-SDAG-NEXT: .Ltmp6: // EH_LABEL
+; CHECK-SDAG-NEXT: bl shared_za_call
+; CHECK-SDAG-NEXT: .Ltmp7: // EH_LABEL
+; CHECK-SDAG-NEXT: .LBB2_3: // %exit
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+; CHECK-SDAG-NEXT: .LBB2_4: // %catch
+; CHECK-SDAG-NEXT: .Ltmp8: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: sub x19, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB2_6
+; CHECK-SDAG-NEXT: // %bb.5: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB2_6: // %catch
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19
+; CHECK-SDAG-NEXT: bl __cxa_begin_catch
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB2_8
+; CHECK-SDAG-NEXT: // %bb.7: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB2_8: // %catch
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: bl noexcept_shared_za_call
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19
+; CHECK-SDAG-NEXT: bl __cxa_end_catch
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-SDAG-NEXT: sub x0, x29, #16
+; CHECK-SDAG-NEXT: cbnz x8, .LBB2_10
+; CHECK-SDAG-NEXT: // %bb.9: // %catch
+; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
+; CHECK-SDAG-NEXT: .LBB2_10: // %catch
+; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-SDAG-NEXT: b .LBB2_3
invoke void @shared_za_call() #4
to label %exit unwind label %catch
catch:
@@ -275,6 +504,234 @@ exit:
ret void
}
+; A simple ZT0 exception example that corresponds to:
+;
+; struct ZT0Resource {
+; ~ZT0Resource() __arm_inout("zt0") {
+; shared_zt0_call(); // simulate cleanup in destructor
+; }
+; };
+;
+; void try_catch_shared_zt0_callee() __arm_inout("zt0") {
+; ZT0Resource r;
+; may_throw();
+; }
+;
+; This code may require reloading ZT0 in the cleanup for ~ZT0Resource().
+;
+; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented).
+define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: try_catch_shared_zt0_callee:
+; CHECK: .Lfunc_begin3:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .cfi_lsda 28, .Lexception3
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stp x9, x8, [x29, #-80]
+; CHECK-NEXT: .Ltmp9: // EH_LABEL
+; CHECK-NEXT: sub x19, x29, #64
+; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: bl may_throw
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: .Ltmp10: // EH_LABEL
+; CHECK-NEXT: // %bb.1: // %return_normally
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB3_2: // %unwind_dtors
+; CHECK-NEXT: .Ltmp11: // EH_LABEL
+; CHECK-NEXT: sub x20, x29, #64
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #80
+; CHECK-NEXT: cbnz x8, .LBB3_4
+; CHECK-NEXT: // %bb.3: // %unwind_dtors
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB3_4: // %unwind_dtors
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: bl shared_zt0_call
+; CHECK-NEXT: str zt0, [x20]
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl _Unwind_Resume
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: ldr zt0, [x20]
+;
+; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee:
+; CHECK-SDAG: .Lfunc_begin3:
+; CHECK-SDAG-NEXT: .cfi_startproc
+; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception3
+; CHECK-SDAG-NEXT: // %bb.0:
+; CHECK-SDAG-NEXT: sub sp, sp, #96
+; CHECK-SDAG-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 96
+; CHECK-SDAG-NEXT: .cfi_offset w19, -8
+; CHECK-SDAG-NEXT: .cfi_offset w20, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -32
+; CHECK-SDAG-NEXT: .Ltmp9: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x19, sp
+; CHECK-SDAG-NEXT: str zt0, [x19]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: bl may_throw
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x19]
+; CHECK-SDAG-NEXT: .Ltmp10: // EH_LABEL
+; CHECK-SDAG-NEXT: // %bb.1: // %return_normally
+; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: add sp, sp, #96
+; CHECK-SDAG-NEXT: ret
+; CHECK-SDAG-NEXT: .LBB3_2: // %unwind_dtors
+; CHECK-SDAG-NEXT: .Ltmp11: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x20, sp
+; CHECK-SDAG-NEXT: mov x19, x0
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x20]
+; CHECK-SDAG-NEXT: bl shared_zt0_call
+; CHECK-SDAG-NEXT: str zt0, [x20]
+; CHECK-SDAG-NEXT: smstop za
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl _Unwind_Resume
+; CHECK-SDAG-NEXT: smstart za
+; CHECK-SDAG-NEXT: ldr zt0, [x20]
+ invoke void @may_throw()
+ to label %return_normally unwind label %unwind_dtors
+
+unwind_dtors:
+ %5 = landingpad { ptr, i32 }
+ cleanup
+ tail call void @shared_zt0_call()
+ resume { ptr, i32 } %5
+
+return_normally:
+ ret void
+}
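+
+; A minimal sketch of the spill/reload protocol above (xN is a placeholder
+; register; the CHECK lines are the authoritative output). ZT0 is carried
+; across the private-ZA region via an explicit stack slot:
+;   str zt0, [xN]   ; spill ZT0 before smstop za
+;   smstop za
+;   bl may_throw    ; callee may clobber ZA/ZT0
+;   smstart za
+;   ldr zt0, [xN]   ; reload ZT0 after re-enabling ZA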
+
+; This example corresponds to:
+;
+; __arm_agnostic("sme_za_state") void try_catch_agnostic_za()
+; {
+; try {
+; may_throw();
+; } catch(...) {
+; }
+; }
+;
+; In this example we must execute __arm_sme_restore once we enter the catch block
+; (before executing __arm_sme_save again, which would invalidate the prior save).
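+;
+; A minimal sketch of the required ordering on the catch path (assumed call
+; sequence only; the CHECK lines below are authoritative):
+;   bl __arm_sme_restore   ; restore the state saved before may_throw()
+;   bl __arm_sme_save      ; a fresh save is only valid after this restore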
+define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: try_catch_agnostic_za:
+; CHECK: .Lfunc_begin4:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .cfi_lsda 28, .Lexception4
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: bl __arm_sme_state_size
+; CHECK-NEXT: sub sp, sp, x0
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .Ltmp12: // EH_LABEL
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_save
+; CHECK-NEXT: bl may_throw
+; CHECK-NEXT: .Ltmp13: // EH_LABEL
+; CHECK-NEXT: .LBB4_1: // %exit
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_restore
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB4_2: // %catch
+; CHECK-NEXT: .Ltmp14: // EH_LABEL
+; CHECK-NEXT: bl __cxa_begin_catch
+; CHECK-NEXT: bl __cxa_end_catch
+; CHECK-NEXT: b .LBB4_1
+;
+; CHECK-SDAG-LABEL: try_catch_agnostic_za:
+; CHECK-SDAG: .Lfunc_begin4:
+; CHECK-SDAG-NEXT: .cfi_startproc
+; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception4
+; CHECK-SDAG-NEXT: // %bb.0:
+; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-SDAG-NEXT: mov x29, sp
+; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32
+; CHECK-SDAG-NEXT: .cfi_offset w19, -16
+; CHECK-SDAG-NEXT: .cfi_offset w30, -24
+; CHECK-SDAG-NEXT: .cfi_offset w29, -32
+; CHECK-SDAG-NEXT: bl __arm_sme_state_size
+; CHECK-SDAG-NEXT: sub sp, sp, x0
+; CHECK-SDAG-NEXT: mov x19, sp
+; CHECK-SDAG-NEXT: .Ltmp12: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: bl may_throw
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: .Ltmp13: // EH_LABEL
+; CHECK-SDAG-NEXT: .LBB4_1: // %exit
+; CHECK-SDAG-NEXT: mov sp, x29
+; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-SDAG-NEXT: ret
+; CHECK-SDAG-NEXT: .LBB4_2: // %catch
+; CHECK-SDAG-NEXT: .Ltmp14: // EH_LABEL
+; CHECK-SDAG-NEXT: mov x1, x0
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: mov x0, x1
+; CHECK-SDAG-NEXT: bl __cxa_begin_catch
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_save
+; CHECK-SDAG-NEXT: bl __cxa_end_catch
+; CHECK-SDAG-NEXT: mov x0, x19
+; CHECK-SDAG-NEXT: bl __arm_sme_restore
+; CHECK-SDAG-NEXT: b .LBB4_1
+ invoke void @may_throw()
+ to label %exit unwind label %catch
+catch:
+ %eh_info = landingpad { ptr, i32 }
+ catch ptr null
+ %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0
+ tail call ptr @__cxa_begin_catch(ptr %exception_ptr)
+ tail call void @__cxa_end_catch()
+ br label %exit
+
+exit:
+ ret void
+}
+
declare ptr @__cxa_allocate_exception(i64)
declare void @__cxa_throw(ptr, ptr, ptr)
declare ptr @__cxa_begin_catch(ptr)
@@ -284,3 +741,4 @@ declare i32 @__gxx_personality_v0(...)
declare void @may_throw()
declare void @shared_za_call() "aarch64_inout_za"
declare void @noexcept_shared_za_call() "aarch64_inout_za"
+declare void @shared_zt0_call() "aarch64_inout_zt0"
diff --git a/llvm/test/CodeGen/AArch64/strict-fp-opt.ll b/llvm/test/CodeGen/AArch64/strict-fp-opt.ll
index bb7cd22..c433291 100644
--- a/llvm/test/CodeGen/AArch64/strict-fp-opt.ll
+++ b/llvm/test/CodeGen/AArch64/strict-fp-opt.ll
@@ -1,31 +1,40 @@
-; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64 -global-isel=true -global-isel-abort=2 %s -o - | FileCheck %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel=true -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for unused_div_fpexcept_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for unused_div_round_dynamic
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for add_twice_fpexcept_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for add_twice_round_dynamic
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for set_rounding
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for set_rounding_fpexcept_strict
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for set_rounding_round_dynamic
; Div whose result is unused should be removed unless we have strict exceptions
-; CHECK-LABEL: unused_div:
-; CHECK-NOT: fdiv
-; CHECK: ret
define void @unused_div(float %x, float %y) {
+; CHECK-LABEL: unused_div:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ret
entry:
%add = fdiv float %x, %y
ret void
}
-; CHECK-LABEL: unused_div_fpexcept_strict:
-; CHECK: fdiv s0, s0, s1
-; CHECK-NEXT: ret
define void @unused_div_fpexcept_strict(float %x, float %y) #0 {
+; CHECK-LABEL: unused_div_fpexcept_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: ret
entry:
%add = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
ret void
}
-; CHECK-LABEL: unused_div_round_dynamic:
-; CHECK-NOT: fdiv
-; CHECK: ret
define void @unused_div_round_dynamic(float %x, float %y) #0 {
+; CHECK-LABEL: unused_div_round_dynamic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ret
entry:
%add = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
ret void
@@ -33,14 +42,14 @@ entry:
; Machine CSE should eliminate the second add unless we have strict exceptions
-
-; CHECK-LABEL: add_twice:
-; CHECK: fadd [[ADD:s[0-9]+]], s0, s1
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: fmul [[MUL:s[0-9]+]], [[ADD]], [[ADD]]
-; CHECK-NEXT: fcsel s0, [[ADD]], [[MUL]], eq
-; CHECK-NEXT: ret
define float @add_twice(float %x, float %y, i32 %n) {
+; CHECK-LABEL: add_twice:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: fmul s1, s0, s0
+; CHECK-NEXT: fcsel s0, s0, s1, eq
+; CHECK-NEXT: ret
entry:
%add = fadd float %x, %y
%tobool.not = icmp eq i32 %n, 0
@@ -56,15 +65,17 @@ if.end:
ret float %a.0
}
-; CHECK-LABEL: add_twice_fpexcept_strict:
-; CHECK: fmov [[X:s[0-9]+]], s0
-; CHECK-NEXT: fadd s0, s0, s1
-; CHECK-NEXT: cbz w0, [[LABEL:.LBB[0-9_]+]]
-; CHECK: fadd [[ADD:s[0-9]+]], [[X]], s1
-; CHECK-NEXT: fmul s0, s0, [[ADD]]
-; CHECK: [[LABEL]]:
-; CHECK-NEXT: ret
define float @add_twice_fpexcept_strict(float %x, float %y, i32 %n) #0 {
+; CHECK-LABEL: add_twice_fpexcept_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmov s2, s0
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: cbz w0, .LBB4_2
+; CHECK-NEXT: // %bb.1: // %if.then
+; CHECK-NEXT: fadd s1, s2, s1
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: .LBB4_2: // %if.end
+; CHECK-NEXT: ret
entry:
%add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
%tobool.not = icmp eq i32 %n, 0
@@ -80,14 +91,15 @@ if.end:
ret float %a.0
}
-; CHECK-LABEL: add_twice_round_dynamic:
-; CHECK: fadd s0, s0, s1
-; CHECK-NEXT: cbz w0, [[LABEL:.LBB[0-9_]+]]
-; CHECK-NOT: fadd
-; CHECK: fmul s0, s0, s0
-; CHECK: [[LABEL]]:
-; CHECK-NEXT: ret
define float @add_twice_round_dynamic(float %x, float %y, i32 %n) #0 {
+; CHECK-LABEL: add_twice_round_dynamic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: cbz w0, .LBB5_2
+; CHECK-NEXT: // %bb.1: // %if.then
+; CHECK-NEXT: fmul s0, s0, s0
+; CHECK-NEXT: .LBB5_2: // %if.end
+; CHECK-NEXT: ret
entry:
%add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%tobool.not = icmp eq i32 %n, 0
@@ -108,17 +120,18 @@ if.end:
; dynamic (as they may give different results) or when we have strict exceptions
; (the llvm.set.rounding is irrelevant, but both could trap).
-; CHECK-LABEL: set_rounding:
-; CHECK-DAG: fadd [[SREG:s[0-9]+]], s0, s1
-; CHECK-DAG: mrs [[XREG1:x[0-9]+]], FPCR
-; CHECK-DAG: orr [[XREG2:x[0-9]+]], [[XREG1]], #0xc00000
-; CHECK: msr FPCR, [[XREG2]]
-; CHECK-NEXT: mrs [[XREG3:x[0-9]+]], FPCR
-; CHECK-NEXT: and [[XREG4:x[0-9]+]], [[XREG3]], #0xffffffffff3fffff
-; CHECK-NEXT: msr FPCR, [[XREG4]]
-; CHECK-NEXT: fsub s0, [[SREG]], [[SREG]]
-; CHECK-NEXT: ret
define float @set_rounding(float %x, float %y) {
+; CHECK-LABEL: set_rounding:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mrs x8, FPCR
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: orr x8, x8, #0xc00000
+; CHECK-NEXT: msr FPCR, x8
+; CHECK-NEXT: mrs x8, FPCR
+; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff
+; CHECK-NEXT: msr FPCR, x8
+; CHECK-NEXT: fsub s0, s0, s0
+; CHECK-NEXT: ret
entry:
%add1 = fadd float %x, %y
call void @llvm.set.rounding(i32 0)
@@ -128,18 +141,19 @@ entry:
ret float %sub
}
-; CHECK-LABEL: set_rounding_fpexcept_strict:
-; CHECK-DAG: fadd [[SREG1:s[0-9]+]], s0, s1
-; CHECK-DAG: mrs [[XREG1:x[0-9]+]], FPCR
-; CHECK-DAG: orr [[XREG2:x[0-9]+]], [[XREG1]], #0xc00000
-; CHECK: msr FPCR, [[XREG2]]
-; CHECK-DAG: fadd [[SREG2:s[0-9]+]], s0, s1
-; CHECK-DAG: mrs [[XREG3:x[0-9]+]], FPCR
-; CHECK-DAG: and [[XREG4:x[0-9]+]], [[XREG3]], #0xffffffffff3fffff
-; CHECK-NEXT: msr FPCR, [[XREG4]]
-; CHECK-NEXT: fsub s0, [[SREG1]], [[SREG2]]
-; CHECK-NEXT: ret
define float @set_rounding_fpexcept_strict(float %x, float %y) #0 {
+; CHECK-LABEL: set_rounding_fpexcept_strict:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd s2, s0, s1
+; CHECK-NEXT: mrs x8, FPCR
+; CHECK-NEXT: orr x8, x8, #0xc00000
+; CHECK-NEXT: msr FPCR, x8
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: mrs x8, FPCR
+; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff
+; CHECK-NEXT: msr FPCR, x8
+; CHECK-NEXT: fsub s0, s2, s0
+; CHECK-NEXT: ret
entry:
%add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
call void @llvm.set.rounding(i32 0) #0
@@ -149,18 +163,19 @@ entry:
ret float %sub
}
-; CHECK-LABEL: set_rounding_round_dynamic:
-; CHECK-DAG: fadd [[SREG1:s[0-9]+]], s0, s1
-; CHECK-DAG: mrs [[XREG1:x[0-9]+]], FPCR
-; CHECK-DAG: orr [[XREG2:x[0-9]+]], [[XREG1]], #0xc00000
-; CHECK: msr FPCR, [[XREG2]]
-; CHECK-DAG: fadd [[SREG2:s[0-9]+]], s0, s1
-; CHECK-DAG: mrs [[XREG3:x[0-9]+]], FPCR
-; CHECK-DAG: and [[XREG4:x[0-9]+]], [[XREG3]], #0xffffffffff3fffff
-; CHECK-NEXT: msr FPCR, [[XREG4]]
-; CHECK-NEXT: fsub s0, [[SREG1]], [[SREG2]]
-; CHECK-NEXT: ret
define float @set_rounding_round_dynamic(float %x, float %y) #0 {
+; CHECK-LABEL: set_rounding_round_dynamic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mrs x8, FPCR
+; CHECK-NEXT: fadd s2, s0, s1
+; CHECK-NEXT: orr x8, x8, #0xc00000
+; CHECK-NEXT: msr FPCR, x8
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: mrs x8, FPCR
+; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff
+; CHECK-NEXT: msr FPCR, x8
+; CHECK-NEXT: fsub s0, s2, s0
+; CHECK-NEXT: ret
entry:
%add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
call void @llvm.set.rounding(i32 0) #0
@@ -178,3 +193,6 @@ declare i32 @llvm.get.rounding()
declare void @llvm.set.rounding(i32)
attributes #0 = { strictfp }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
+; CHECK-SD: {{.*}}
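A minimal sketch, separate from the patch hunks above, of the invariant these strict-FP tests pin down: a call to a constrained intrinsic with "fpexcept.strict" carries an implicit side effect (it may raise an FP exception), so dead-code elimination and MachineCSE must preserve it, while the equivalent plain FP instruction may be dropped or merged. The function names below are illustrative only; the intrinsic is the one used in the tests:

; The unused fdiv has no side effects, so DCE may delete it.
define void @sketch_dce_ok(float %x, float %y) {
  %r = fdiv float %x, %y
  ret void
}

; The unused constrained fdiv may raise an exception, so it must survive DCE.
define void @sketch_dce_blocked(float %x, float %y) strictfp {
  %r = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") strictfp
  ret void
}

declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata)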
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 62d41fc..19e1aa5 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -26,9 +26,9 @@ define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_and_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.b[0]
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: tst w8, #0x80
+; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
@@ -120,9 +120,9 @@ define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_and_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[0]
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: tst w8, #0x8000
+; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
@@ -305,9 +305,9 @@ define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_or_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.b[0]
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: tst w8, #0x80
+; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
@@ -399,9 +399,9 @@ define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_or_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[0]
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: tst w8, #0x8000
+; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
@@ -584,9 +584,9 @@ define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_xor_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.b[0]
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: umov w8, v0.b[0]
+; CHECK-NEXT: tst w8, #0x80
+; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
@@ -679,9 +679,9 @@ define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_xor_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: smov w8, v0.h[0]
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, mi
+; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: tst w8, #0x8000
+; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
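A short note on the vecreduce-bool.ll hunks above: for a single-element reduction, the signed compare against zero only inspects the sign bit, so the sign-extending lane move plus `cmp`/`mi` can be replaced by a plain unsigned move plus a direct sign-bit test (`tst w8, #0x80` for i8, `#0x8000` for i16) selecting on `ne`. A minimal LLVM IR sketch of the underlying equivalence (function names are illustrative):

define i1 @sketch_sign_slt(i8 %x) {
  %c = icmp slt i8 %x, 0
  ret i1 %c
}

; Computes the same predicate: bit 7 (0x80) is set iff %x < 0.
define i1 @sketch_sign_tst(i8 %x) {
  %m = and i8 %x, -128
  %c = icmp ne i8 %m, 0
  ret i1 %c
}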
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 0490e5a..94ba5cd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -10908,12 +10908,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1250-LABEL: v_fadd_v2bf16:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fadd <2 x bfloat> %a, %b
ret <2 x bfloat> %op
}
@@ -11446,13 +11447,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1250-LABEL: v_fadd_v4bf16:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2
-; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2
+; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fadd <4 x bfloat> %a, %b
ret <4 x bfloat> %op
}
@@ -49991,6 +49993,622 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
ret <4 x bfloat> %op
}
+define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
+; GCN-LABEL: v_fma_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_fma_f32 v7, v7, v15, v23
+; GCN-NEXT: v_fma_f32 v6, v6, v14, v22
+; GCN-NEXT: v_fma_f32 v5, v5, v13, v21
+; GCN-NEXT: v_fma_f32 v4, v4, v12, v20
+; GCN-NEXT: v_fma_f32 v3, v3, v11, v19
+; GCN-NEXT: v_fma_f32 v2, v2, v10, v18
+; GCN-NEXT: v_fma_f32 v1, v1, v9, v17
+; GCN-NEXT: v_fma_f32 v0, v0, v8, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_fma_f32 v7, v7, v15, v23
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_fma_f32 v6, v6, v14, v15
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_fma_f32 v5, v5, v13, v14
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_fma_f32 v4, v4, v12, v13
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_fma_f32 v3, v3, v11, v12
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_fma_f32 v2, v2, v10, v11
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_fma_f32 v1, v1, v9, v11
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_fma_f32 v0, v0, v8, v9
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v3
+; GFX8-NEXT: v_fma_f32 v12, v14, v13, v12
+; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v2
+; GFX8-NEXT: v_fma_f32 v7, v13, v11, v7
+; GFX8-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v7
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_fma_f32 v6, v11, v10, v6
+; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_fma_f32 v5, v10, v9, v5
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_fma_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v3
+; GFX900-NEXT: v_fma_f32 v12, v14, v13, v12
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v2
+; GFX900-NEXT: v_fma_f32 v7, v13, v11, v7
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX900-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX900-NEXT: v_add3_u32 v11, v11, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX900-NEXT: v_fma_f32 v6, v11, v10, v6
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX900-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX900-NEXT: v_add3_u32 v10, v10, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX900-NEXT: v_fma_f32 v5, v10, v9, v5
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v7
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_fmac_f32_e32 v12, v14, v13
+; GFX950-NEXT: v_fmac_f32_e32 v11, v3, v7
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v10
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v2
+; GFX950-NEXT: v_fmac_f32_e32 v3, v13, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_fmac_f32_e32 v7, v2, v6
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX950-NEXT: v_fmac_f32_e32 v2, v10, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v6, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX950-NEXT: v_fmac_f32_e32 v1, v9, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v5, v0, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v6, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v7, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v11, v12
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX10-NEXT: v_fmac_f32_e32 v12, v14, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_fmac_f32_e32 v11, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v10
+; GFX10-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_add3_u32 v13, v13, v12, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v3, v14, v7
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; GFX10-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v15, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v13, v3, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v6
+; GFX10-NEXT: v_add3_u32 v12, v16, v11, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT: v_add3_u32 v13, v13, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; GFX10-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_fmac_f32_e32 v2, v14, v6
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_add3_u32 v6, v16, v7, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v15, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_fmac_f32_e32 v9, v1, v5
+; GFX10-NEXT: v_fmac_f32_e32 v15, v18, v16
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX10-NEXT: v_fmac_f32_e32 v8, v0, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo
+; GFX10-NEXT: v_add3_u32 v0, v14, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX10-NEXT: v_bfe_u32 v5, v15, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v11
+; GFX10-NEXT: v_add3_u32 v2, v5, v15, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX10-NEXT: v_add3_u32 v0, v4, v9, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v15
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_add3_u32 v5, v7, v8, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX10-NEXT: v_perm_b32 v0, v4, v2, 0x7060302
+; GFX10-NEXT: v_perm_b32 v2, v6, v3, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v17, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v7, v10, 0x7060302
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fma_v8bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v7
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v11, v3, v7
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v7, v2, v6
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v12, v14, v13
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v11
+; GFX11TRUE16-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v12
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v10
+; GFX11TRUE16-NEXT: v_bfe_u32 v10, v11, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11TRUE16-NEXT: v_bfe_u32 v13, v7, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v14, v16, v15
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v10, v11, 0x7fff
+; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v5
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX11TRUE16-NEXT: v_bfe_u32 v15, v14, 16, 1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_add3_u32 v10, v15, v14, 0x7fff
+; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v9
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v14
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v9, v1, v5
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v12, vcc_lo
+; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v7, 0x7fff
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v4, v10, v11 :: v_dual_and_b32 v5, 0xffff0000, v8
+; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v9, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v15, v17, v16
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v12, v15, 16, 1
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v13, v16, v14
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v0, v1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v15
+; GFX11TRUE16-NEXT: v_add3_u32 v8, v12, v15, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v13, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v13, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v9, v11, v5, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v0, v12, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fma_v8bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v12, v14, v13 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v11, v3, v7
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11FAKE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v10
+; GFX11FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v11
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v3, v14, v7
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_and_b32 v7, 0xffff0000, v10
+; GFX11FAKE16-NEXT: v_add3_u32 v12, v16, v11, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1
+; GFX11FAKE16-NEXT: v_bfe_u32 v13, v3, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_add3_u32 v13, v13, v3, 0x7fff
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v3, v13, v15 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v7, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v5
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v8
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v9
+; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11FAKE16-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v14, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11FAKE16-NEXT: v_add3_u32 v6, v16, v7, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v9, v1, v5 :: v_dual_and_b32 v8, 0xffff0000, v8
+; GFX11FAKE16-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v2
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v9
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v8, v0, v4
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v14, v2, 0x7fff
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v15, v18, v16
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v4, v9, 0x7fff
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v15, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v15
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v5, v15, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v8, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v0, v13, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v4, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v2, v6, v3, 0x7060302
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v1, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v12, v17, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v10, 0x7060302
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1250-LABEL: v_fma_v8bf16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -50000,11 +50618,1239 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10
; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
-define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
%op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c)
ret <8 x bfloat> %op
}
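; A minimal sketch, separate from the patch, of what the GFX8/GFX900 scalar
; expansion above does per bf16 lane: the fma runs in f32, then the result is
; converted back to bf16 with round-to-nearest-even and NaN quieting. The
; `v_bfe_u32 ..., 16, 1` extracts the would-be LSB of the bf16 result,
; `v_add3_u32 ..., 0x7fff` applies the RNE bias, and the `v_or 0x400000` /
; `v_cmp_u_f32` / `v_cndmask` trio substitutes a quiet NaN for unordered
; inputs. The same computation in LLVM IR (function name illustrative):
define i16 @sketch_f32_to_bf16_rne(float %f) {
  %bits = bitcast float %f to i32
  %shr = lshr i32 %bits, 16
  %lsb = and i32 %shr, 1                ; LSB of the truncated bf16 result
  %biased = add i32 %bits, 32767        ; 0x7fff: round-to-nearest bias
  %rounded = add i32 %biased, %lsb      ; +1 more on ties, giving RNE
  %isnan = fcmp uno float %f, %f
  %quiet = or i32 %bits, 4194304        ; 0x400000: force a quiet NaN
  %sel = select i1 %isnan, i32 %quiet, i32 %rounded
  %hi = lshr i32 %sel, 16
  %res = trunc i32 %hi to i16
  ret i16 %res
}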
+define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) {
+; GCN-LABEL: v_fma_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_fma_f32 v15, v15, v31, v32
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_fma_f32 v14, v14, v30, v31
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_fma_f32 v13, v13, v29, v30
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_fma_f32 v12, v12, v28, v29
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_fma_f32 v11, v11, v27, v28
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_fma_f32 v10, v10, v26, v27
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_fma_f32 v9, v9, v25, v26
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_fma_f32 v8, v8, v24, v25
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_fma_f32 v7, v7, v23, v24
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_fma_f32 v6, v6, v22, v23
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_fma_f32 v5, v5, v21, v22
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_fma_f32 v4, v4, v20, v21
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_fma_f32 v3, v3, v19, v20
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_fma_f32 v2, v2, v18, v19
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_fma_f32 v1, v1, v17, v18
+; GCN-NEXT: v_fma_f32 v0, v0, v16, v19
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_fma_f32 v15, v15, v31, v32
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_fma_f32 v14, v14, v30, v31
+; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_fma_f32 v13, v13, v29, v30
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_fma_f32 v12, v12, v28, v29
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_fma_f32 v11, v11, v27, v28
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_fma_f32 v10, v10, v26, v27
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_fma_f32 v9, v9, v25, v26
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_fma_f32 v8, v8, v24, v25
+; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_fma_f32 v7, v7, v23, v24
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_fma_f32 v6, v6, v22, v23
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_fma_f32 v5, v5, v21, v22
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_fma_f32 v4, v4, v20, v21
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_fma_f32 v3, v3, v19, v20
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_fma_f32 v2, v2, v18, v19
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_fma_f32 v1, v1, v17, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v19
+; GFX7-NEXT: v_fma_f32 v0, v0, v16, v17
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_fma_f32 v24, v26, v25, v24
+; GFX8-NEXT: v_fma_f32 v7, v7, v15, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_fma_f32 v15, v25, v23, v15
+; GFX8-NEXT: v_fma_f32 v6, v6, v14, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_fma_f32 v14, v23, v22, v14
+; GFX8-NEXT: v_fma_f32 v5, v5, v13, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_fma_f32 v13, v22, v21, v13
+; GFX8-NEXT: v_fma_f32 v4, v4, v12, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_fma_f32 v12, v21, v20, v12
+; GFX8-NEXT: v_fma_f32 v3, v3, v11, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_fma_f32 v11, v20, v19, v11
+; GFX8-NEXT: v_fma_f32 v2, v2, v10, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_fma_f32 v10, v19, v18, v10
+; GFX8-NEXT: v_fma_f32 v1, v1, v9, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_fma_f32 v0, v0, v8, v16
+; GFX8-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v24
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_or_b32_e32 v16, 0x400000, v24
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v16, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v7
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_fma_f32 v9, v18, v17, v9
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v15
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v6
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v14
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v14
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v5
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v13
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v4
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v12
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v3
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v11
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v11
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v2
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v10
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v9
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
+; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
+; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
+; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
+; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
+; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_fma_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_fma_f32 v24, v26, v25, v24
+; GFX900-NEXT: v_fma_f32 v7, v7, v15, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_fma_f32 v15, v25, v23, v15
+; GFX900-NEXT: v_fma_f32 v6, v6, v14, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_fma_f32 v14, v23, v22, v14
+; GFX900-NEXT: v_fma_f32 v5, v5, v13, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_fma_f32 v13, v22, v21, v13
+; GFX900-NEXT: v_fma_f32 v4, v4, v12, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_fma_f32 v12, v21, v20, v12
+; GFX900-NEXT: v_fma_f32 v3, v3, v11, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_fma_f32 v11, v20, v19, v11
+; GFX900-NEXT: v_fma_f32 v2, v2, v10, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_fma_f32 v10, v19, v18, v10
+; GFX900-NEXT: v_fma_f32 v1, v1, v9, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_fma_f32 v0, v0, v8, v16
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v8, v8, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v16, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v16, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX900-NEXT: v_fma_f32 v9, v18, v17, v9
+; GFX900-NEXT: v_add3_u32 v16, v16, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v15, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v13, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v8, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX950-NEXT: v_fmac_f32_e32 v23, v7, v15
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v6
+; GFX950-NEXT: v_fmac_f32_e32 v7, v25, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_fmac_f32_e32 v15, v6, v14
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v5
+; GFX950-NEXT: v_fmac_f32_e32 v6, v22, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_fmac_f32_e32 v14, v5, v13
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
+; GFX950-NEXT: v_fmac_f32_e32 v5, v21, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_fmac_f32_e32 v13, v4, v12
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
+; GFX950-NEXT: v_fmac_f32_e32 v4, v20, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_fmac_f32_e32 v12, v3, v11
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v2
+; GFX950-NEXT: v_fmac_f32_e32 v3, v19, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_fmac_f32_e32 v11, v2, v10
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
+; GFX950-NEXT: v_fmac_f32_e32 v2, v18, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v10, v1, v9
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX950-NEXT: v_fmac_f32_e32 v1, v17, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v9, v0, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v9, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v10, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v11, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v12, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v13, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v14, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v15, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v23, v24
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
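; Note that the GFX950 block above needs no manual rounding: v_fmac_f32_e32
; accumulates the f32 product in place, and the native v_cvt_pk_bf16_f32 packs
; each pair of f32 results directly to bf16, replacing the bfe/add3/or/cndmask
; sequences the other subtargets emit.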
+; GFX10-LABEL: v_fma_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_fmac_f32_e32 v23, v7, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v22
+; GFX10-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX10-NEXT: v_bfe_u32 v28, v23, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v7, v26, v15
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_add3_u32 v24, v28, v23, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v26, v7, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v15, v6, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v25, v27, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_fmac_f32_e32 v6, v27, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo
+; GFX10-NEXT: v_add3_u32 v24, v26, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v26, v15, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_add3_u32 v21, v26, v15, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v14, v5, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v25, v6, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_fmac_f32_e32 v5, v26, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v21, v24, vcc_lo
+; GFX10-NEXT: v_add3_u32 v21, v25, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX10-NEXT: v_bfe_u32 v25, v14, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v20
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_add3_u32 v20, v25, v14, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v13, v4, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v21, v24, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v24, v5, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_fmac_f32_e32 v4, v25, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v20, v21, vcc_lo
+; GFX10-NEXT: v_add3_u32 v20, v24, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v5
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_bfe_u32 v24, v13, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v12, v3, v11
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_fmac_f32_e32 v19, v26, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v20, v21, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT: v_add3_u32 v21, v24, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v24, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v25, v19, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v19
+; GFX10-NEXT: v_fmac_f32_e32 v18, v2, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v20, vcc_lo
+; GFX10-NEXT: v_add3_u32 v11, v24, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX10-NEXT: v_add3_u32 v24, v25, v19, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_fmac_f32_e32 v2, v25, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_bfe_u32 v20, v2, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v17, v1, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v24, v26, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add3_u32 v1, v20, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_fmac_f32_e32 v16, v0, v8
+; GFX10-NEXT: v_bfe_u32 v0, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v18, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v17
+; GFX10-NEXT: v_add3_u32 v0, v0, v17, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_bfe_u32 v2, v16, 16, 1
+; GFX10-NEXT: v_add3_u32 v8, v8, v24, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v24
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_add3_u32 v2, v2, v16, 0x7fff
+; GFX10-NEXT: v_add3_u32 v12, v27, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_perm_b32 v1, v9, v1, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_perm_b32 v2, v8, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v21, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v11, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v12, v5, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v14, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v15, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v23, v22, 0x7060302
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fma_v16bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v7
+; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
+; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v6
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v24, v26, v25 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v22
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v26, v28, v27
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v22, v6, v14
+; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v13
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11TRUE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v24
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v23, v7, v15
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v25, v29, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v5
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v15, v23, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11TRUE16-NEXT: v_add3_u32 v15, v15, v23, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v14, v29, v28 :: v_dual_cndmask_b32 v15, v15, v25
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v4
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v24, v27, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v20
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11TRUE16-NEXT: v_add3_u32 v23, v23, v22, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v22
+; GFX11TRUE16-NEXT: v_bfe_u32 v28, v14, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v20, v4, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v19
+; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v11
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v14
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v21, v5, v13
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v23, v27, vcc_lo
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v28, v14, 0x7fff
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v27, v20, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v22, vcc_lo
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX11TRUE16-NEXT: v_add3_u32 v14, v23, v21, 0x7fff
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11TRUE16-NEXT: v_add3_u32 v23, v25, v24, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v21, v27, v20, 0x7fff
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v22, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v18
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v14.h
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v12, v25, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v23, v26, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v10
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v12, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v20, v21, v22 :: v_dual_and_b32 v25, 0xffff0000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v21, v23, v12, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v20.h
+; GFX11TRUE16-NEXT: v_add3_u32 v12, v23, v24, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v19, v3, v11
+; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v21, v22 :: v_dual_and_b32 v22, 0xffff0000, v17
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11TRUE16-NEXT: v_bfe_u32 v18, v19, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v22, v25, v23 :: v_dual_fmac_f32 v11, v2, v10
+; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v19
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v18, v19, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v24
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
+; GFX11TRUE16-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v17, v1, v9 :: v_dual_cndmask_b32 v10, v2, v10
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v16
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v12, v18, vcc_lo
+; GFX11TRUE16-NEXT: v_add3_u32 v12, v21, v11, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
+; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11TRUE16-NEXT: v_bfe_u32 v11, v17, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v21, v24, v23
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v9, v0, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo
+; GFX11TRUE16-NEXT: v_add3_u32 v11, v11, v17, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v17
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v21, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11TRUE16-NEXT: v_add3_u32 v12, v19, v22, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v21, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v22
+; GFX11TRUE16-NEXT: v_add3_u32 v16, v18, v9, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v0, v19, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v11.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v18.h
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fma_v16bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v24, v26, v25 :: v_dual_and_b32 v23, 0xffff0000, v23
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v23, v7, v15 :: v_dual_lshlrev_b32 v26, 16, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11FAKE16-NEXT: v_bfe_u32 v28, v23, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11FAKE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_add3_u32 v24, v28, v23, 0x7fff
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v26, v15
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v27 :: v_dual_and_b32 v15, 0xffff0000, v22
+; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11FAKE16-NEXT: v_bfe_u32 v26, v7, 16, 1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v5
+; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_add3_u32 v24, v26, v7, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v7, v24, v25 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v15, v6, v14 :: v_dual_lshlrev_b32 v14, 16, v13
+; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v15
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v21
+; GFX11FAKE16-NEXT: v_bfe_u32 v26, v15, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v6, v27, v14
+; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; GFX11FAKE16-NEXT: v_add3_u32 v21, v26, v15, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v4
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v6, 16, 1
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v14, v5, v13 :: v_dual_lshlrev_b32 v5, 16, v20
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v21, v24, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_add3_u32 v21, v25, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v14, 16, 1
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v5, v26, v13 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v20
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT: v_add3_u32 v20, v25, v14, 0x7fff
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v21, v24 :: v_dual_lshlrev_b32 v25, 16, v3
+; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v14
+; GFX11FAKE16-NEXT: v_bfe_u32 v24, v5, 16, 1
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v13, v4, v12 :: v_dual_lshlrev_b32 v4, 16, v19
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v25, v12
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v20, v21, vcc_lo
+; GFX11FAKE16-NEXT: v_add3_u32 v20, v24, v5, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v5
+; GFX11FAKE16-NEXT: v_bfe_u32 v24, v13, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v19
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v12, v3, v11 :: v_dual_cndmask_b32 v5, v20, v21
+; GFX11FAKE16-NEXT: v_add3_u32 v21, v24, v13, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v13
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX11FAKE16-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v24, v12, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v19, v26, v25
+; GFX11FAKE16-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v18, v2, v10
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v19, 16, 1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v20, vcc_lo
+; GFX11FAKE16-NEXT: v_add3_u32 v11, v24, v12, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX11FAKE16-NEXT: v_add3_u32 v24, v25, v19, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v19
+; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v25, v10 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX11FAKE16-NEXT: v_bfe_u32 v20, v2, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v24, v26, vcc_lo
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v16
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v17, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v1, v20, v2, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v16, v0, v8
+; GFX11FAKE16-NEXT: v_bfe_u32 v0, v17, 16, 1
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v24
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v16, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v17, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v17
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v24, 0x7fff
+; GFX11FAKE16-NEXT: v_bfe_u32 v27, v18, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v16, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v16
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v0, v9, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11FAKE16-NEXT: v_add3_u32 v12, v27, v18, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v1, v9, v1, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v20, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v25, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v2, v8, v10, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v21, v3, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v3, v11, v4, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v4, v12, v5, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v5, v14, v6, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v6, v15, v7, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v7, v23, v22, 0x7060302
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1250-LABEL: v_fma_v16bf16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -50018,67 +51864,2797 @@ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat>
; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22
; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
-define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) {
%op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c)
ret <16 x bfloat> %op
}
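; The GFX8/GFX900/GFX10/GFX11 checks above all repeat one idiom per element:
; lacking a bf16 converter, each f32 fma result is truncated back to bf16 with
; round-to-nearest-even plus NaN quieting. A minimal sketch of what the
; recurring v_bfe_u32 / v_add3_u32 / v_or_b32 0x400000 / v_cmp_u_f32 /
; v_cndmask_b32 sequence computes, assuming the f32 result f is viewed as a
; u32 (names below are illustrative, not taken from the test):
;
;   lsb  = (bits >> 16) & 1        ; v_bfe_u32 bits, 16, 1
;   rne  = bits + 0x7fff + lsb     ; v_add3_u32 (two v_add_u32 steps on GFX8)
;   qnan = bits | 0x400000         ; set the f32 quiet-NaN bit
;   out  = isnan(f) ? qnan : rne   ; v_cmp_u_f32 vcc, f, f + v_cndmask_b32
;   bf16 = out >> 16               ; high halves then repacked into pairs
;                                  ; (v_alignbit_b32, v_perm_b32, or v_mov_b16)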
+define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) {
+; GCN-LABEL: v_fma_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_fma_f32 v31, v31, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v30, v30, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v29, v29, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v28, v28, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v27, v27, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v26, v26, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v25, v25, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v24, v24, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v23, v23, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v22, v22, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v21, v21, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v20, v20, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v19, v19, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v18, v18, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v17, v17, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v16, v16, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v15, v15, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v14, v14, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v13, v13, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v12, v12, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v11, v11, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v10, v10, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v9, v9, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v8, v8, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v7, v7, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v6, v6, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v5, v5, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v4, v4, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v3, v3, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v2, v2, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v1, v1, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v0, v0, v32, v33
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_fma_f32 v31, v31, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v30, v30, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v29, v29, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v28, v28, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v27, v27, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v26, v26, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v25, v25, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v24, v24, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v23, v23, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v22, v22, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v21, v21, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v20, v20, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v19, v19, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v18, v18, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v17, v17, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v16, v16, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v15, v15, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v14, v14, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v13, v13, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v12, v12, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v11, v11, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v10, v10, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v9, v9, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v8, v8, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v7, v7, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v6, v6, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v5, v5, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v4, v4, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v3, v3, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v2, v2, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v1, v1, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v0, v0, v32, v33
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v32
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v15, v15, v33, v32
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX8-NEXT: v_fma_f32 v31, v31, v35, v34
+; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v14, v14, v30, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GFX8-NEXT: v_fma_f32 v32, v34, v32, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v13
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v13, v13, v29, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GFX8-NEXT: v_fma_f32 v30, v34, v30, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v12
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v12, v12, v28, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GFX8-NEXT: v_fma_f32 v29, v34, v29, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v11
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v11, v11, v27, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GFX8-NEXT: v_fma_f32 v28, v34, v28, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v10
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v10, v10, v26, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GFX8-NEXT: v_fma_f32 v27, v34, v27, v35
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v9
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v9, v9, v25, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GFX8-NEXT: v_fma_f32 v26, v35, v34, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v8
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v8, v8, v24, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GFX8-NEXT: v_fma_f32 v25, v35, v34, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v7
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v7, v7, v23, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GFX8-NEXT: v_fma_f32 v24, v35, v34, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v6, v6, v22, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GFX8-NEXT: v_fma_f32 v23, v35, v34, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v5, v5, v21, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GFX8-NEXT: v_fma_f32 v22, v35, v34, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v4, v4, v20, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GFX8-NEXT: v_fma_f32 v21, v35, v34, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v3, v3, v19, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX8-NEXT: v_fma_f32 v20, v35, v34, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v2, v2, v18, v33
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX8-NEXT: v_fma_f32 v19, v35, v34, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX8-NEXT: v_fma_f32 v1, v1, v17, v33
+; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
+; GFX8-NEXT: v_fma_f32 v18, v35, v34, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT: v_fma_f32 v0, v0, v16, v17
+; GFX8-NEXT: v_bfe_u32 v16, v31, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v31
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v31
+; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_or_b32_e32 v15, 0x400000, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc
+; GFX8-NEXT: v_bfe_u32 v17, v32, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v32
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v32
+; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v31, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v14
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v30, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v30
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX8-NEXT: v_or_b32_e32 v30, 0x400000, v30
+; GFX8-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v13
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v29, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v29
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX8-NEXT: v_or_b32_e32 v29, 0x400000, v29
+; GFX8-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v12
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v28, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v28
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX8-NEXT: v_or_b32_e32 v28, 0x400000, v28
+; GFX8-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v11
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v27, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v27
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX8-NEXT: v_or_b32_e32 v27, 0x400000, v27
+; GFX8-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v10
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v26, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v26
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v26
+; GFX8-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v9
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v25, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v25
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v25
+; GFX8-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v8, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v8
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v24
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v7
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v23, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v23
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX8-NEXT: v_or_b32_e32 v23, 0x400000, v23
+; GFX8-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v6
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v31, v6, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v22, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v22
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX8-NEXT: v_or_b32_e32 v22, 0x400000, v22
+; GFX8-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v5
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v31, v5, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v21, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v21
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX8-NEXT: v_or_b32_e32 v21, 0x400000, v21
+; GFX8-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v4
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v31, v4, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v20, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v20
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX8-NEXT: v_or_b32_e32 v20, 0x400000, v20
+; GFX8-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v3
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v31, v3, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v19, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v19
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX8-NEXT: v_or_b32_e32 v19, 0x400000, v19
+; GFX8-NEXT: v_cndmask_b32_e32 v19, v31, v19, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v2
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v18, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v18
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_fma_f32 v33, v35, v34, v33
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v31, v1, vcc
+; GFX8-NEXT: v_bfe_u32 v31, v33, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v33
+; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v33
+; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc
+; GFX8-NEXT: v_bfe_u32 v32, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v0
+; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16
+; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16
+; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16
+; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16
+; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16
+; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16
+; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16
+; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16
+; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16
+; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16
+; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16
+; GFX8-NEXT: v_alignbit_b32 v13, v13, v30, 16
+; GFX8-NEXT: v_alignbit_b32 v14, v14, v17, 16
+; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_fma_v32bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: s_waitcnt vmcnt(1)
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v32
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v15, v15, v33, v32
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GFX900-NEXT: v_fma_f32 v31, v31, v35, v34
+; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v30
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v14, v14, v30, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GFX900-NEXT: v_fma_f32 v32, v34, v32, v35
+; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v13
+; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v13, v13, v29, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GFX900-NEXT: v_fma_f32 v30, v34, v30, v35
+; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v12
+; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v12, v12, v28, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GFX900-NEXT: v_fma_f32 v29, v34, v29, v35
+; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v11
+; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v11, v11, v27, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GFX900-NEXT: v_fma_f32 v28, v34, v28, v35
+; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v10
+; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v10, v10, v26, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GFX900-NEXT: v_fma_f32 v27, v34, v27, v35
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v9
+; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v9, v9, v25, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GFX900-NEXT: v_fma_f32 v26, v35, v34, v26
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v8
+; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v8, v8, v24, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GFX900-NEXT: v_fma_f32 v25, v35, v34, v25
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v7
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v7, v7, v23, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GFX900-NEXT: v_fma_f32 v24, v35, v34, v24
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v6
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v6, v6, v22, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GFX900-NEXT: v_fma_f32 v23, v35, v34, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v5
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v5, v5, v21, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GFX900-NEXT: v_fma_f32 v22, v35, v34, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v4
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v4, v4, v20, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GFX900-NEXT: v_fma_f32 v21, v35, v34, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v3, v3, v19, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GFX900-NEXT: v_fma_f32 v20, v35, v34, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v2, v2, v18, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX900-NEXT: v_fma_f32 v19, v35, v34, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v1, v1, v17, v33
+; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GFX900-NEXT: v_fma_f32 v18, v35, v34, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v0
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v33
+; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX900-NEXT: v_fma_f32 v0, v0, v16, v33
+; GFX900-NEXT: v_bfe_u32 v16, v31, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v31, s4
+; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v31
+; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v31, vcc
+; GFX900-NEXT: v_bfe_u32 v31, v15, 16, 1
+; GFX900-NEXT: v_add3_u32 v31, v31, v15, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_or_b32_e32 v15, 0x400000, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc
+; GFX900-NEXT: v_bfe_u32 v31, v32, 16, 1
+; GFX900-NEXT: v_add3_u32 v31, v31, v32, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
+; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v14, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v14, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX900-NEXT: v_or_b32_e32 v30, 0x400000, v30
+; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v30, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v13, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v13, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v29, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v29, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX900-NEXT: v_or_b32_e32 v29, 0x400000, v29
+; GFX900-NEXT: v_cndmask_b32_e32 v29, v32, v29, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v12, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v28, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v28, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX900-NEXT: v_or_b32_e32 v28, 0x400000, v28
+; GFX900-NEXT: v_cndmask_b32_e32 v28, v32, v28, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v11, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v11, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v27, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v27, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX900-NEXT: v_or_b32_e32 v27, 0x400000, v27
+; GFX900-NEXT: v_cndmask_b32_e32 v27, v32, v27, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v10, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v10, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v26, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v26, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v26
+; GFX900-NEXT: v_cndmask_b32_e32 v26, v32, v26, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v9, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v25, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v25, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v25
+; GFX900-NEXT: v_cndmask_b32_e32 v25, v32, v25, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v8, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v8, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v8
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v24, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v7, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v7, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v23, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v23, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX900-NEXT: v_or_b32_e32 v23, 0x400000, v23
+; GFX900-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v6, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v22, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v22, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX900-NEXT: v_or_b32_e32 v22, 0x400000, v22
+; GFX900-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v5, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v21, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v21, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX900-NEXT: v_or_b32_e32 v21, 0x400000, v21
+; GFX900-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v4, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v20, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v20, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX900-NEXT: v_or_b32_e32 v20, 0x400000, v20
+; GFX900-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v3, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v19, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v19, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX900-NEXT: v_or_b32_e32 v19, 0x400000, v19
+; GFX900-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v2, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v18, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v18, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX900-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v1, 16, 1
+; GFX900-NEXT: v_fma_f32 v17, v35, v34, v17
+; GFX900-NEXT: v_add3_u32 v32, v32, v1, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v17, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v17, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX900-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v32, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v32, v32, v0, s4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX900-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX900-NEXT: v_perm_b32 v15, v15, v16, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v32bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:64
+; GFX950-NEXT: scratch_load_dword v36, off, s32
+; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:60
+; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:56
+; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:52
+; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48
+; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:44
+; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:40
+; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:36
+; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:32
+; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:28
+; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:4
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:12
+; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:16
+; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:20
+; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:24
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v43, 0xffff0000, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v45, 16, v14
+; GFX950-NEXT: v_and_b32_e32 v46, 0xffff0000, v29
+; GFX950-NEXT: v_lshlrev_b32_e32 v56, 16, v29
+; GFX950-NEXT: v_and_b32_e32 v59, 0xffff0000, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v61, 16, v12
+; GFX950-NEXT: v_and_b32_e32 v62, 0xffff0000, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v30
+; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v30
+; GFX950-NEXT: v_and_b32_e32 v47, 0xffff0000, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v57, 16, v13
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v15
+; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX950-NEXT: v_and_b32_e32 v58, 0xffff0000, v28
+; GFX950-NEXT: v_lshlrev_b32_e32 v60, 16, v28
+; GFX950-NEXT: s_waitcnt vmcnt(16)
+; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v35
+; GFX950-NEXT: s_waitcnt vmcnt(15)
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v36
+; GFX950-NEXT: v_lshlrev_b32_e32 v63, 16, v36
+; GFX950-NEXT: s_waitcnt vmcnt(14)
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v38
+; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v38
+; GFX950-NEXT: s_waitcnt vmcnt(11)
+; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v49
+; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v11
+; GFX950-NEXT: v_fmac_f32_e32 v36, v38, v62
+; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v49
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v39
+; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v39
+; GFX950-NEXT: v_fmac_f32_e32 v38, v11, v27
+; GFX950-NEXT: s_waitcnt vmcnt(10)
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v50
+; GFX950-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v10
+; GFX950-NEXT: v_fmac_f32_e32 v11, v39, v27
+; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v50
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_fmac_f32_e32 v27, v10, v26
+; GFX950-NEXT: s_waitcnt vmcnt(9)
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v51
+; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v9
+; GFX950-NEXT: v_fmac_f32_e32 v10, v39, v26
+; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v51
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_fmac_f32_e32 v26, v9, v25
+; GFX950-NEXT: s_waitcnt vmcnt(8)
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v52
+; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
+; GFX950-NEXT: v_fmac_f32_e32 v9, v39, v25
+; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v52
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_fmac_f32_e32 v25, v8, v24
+; GFX950-NEXT: s_waitcnt vmcnt(7)
+; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v53
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v7
+; GFX950-NEXT: v_fmac_f32_e32 v8, v39, v24
+; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v53
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_fmac_f32_e32 v24, v7, v23
+; GFX950-NEXT: s_waitcnt vmcnt(6)
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v54
+; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v6
+; GFX950-NEXT: v_fmac_f32_e32 v7, v39, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v54
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_fmac_f32_e32 v23, v6, v22
+; GFX950-NEXT: s_waitcnt vmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v55
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v5
+; GFX950-NEXT: v_fmac_f32_e32 v6, v39, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v55
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_fmac_f32_e32 v22, v5, v21
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v37
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v4
+; GFX950-NEXT: v_fmac_f32_e32 v5, v39, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v37
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_fmac_f32_e32 v21, v4, v20
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v34
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v3
+; GFX950-NEXT: v_fmac_f32_e32 v4, v37, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v34
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_fmac_f32_e32 v20, v3, v19
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v2
+; GFX950-NEXT: v_fmac_f32_e32 v3, v34, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v33
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_fmac_f32_e32 v19, v2, v18
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v32
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GFX950-NEXT: v_fmac_f32_e32 v2, v33, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v32
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v18, v1, v17
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v31
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v35
+; GFX950-NEXT: v_fmac_f32_e32 v15, v40, v12
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v48
+; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v48
+; GFX950-NEXT: v_fmac_f32_e32 v1, v32, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v28, v41, v63
+; GFX950-NEXT: v_fmac_f32_e32 v14, v43, v42
+; GFX950-NEXT: v_fmac_f32_e32 v29, v45, v44
+; GFX950-NEXT: v_fmac_f32_e32 v13, v47, v46
+; GFX950-NEXT: v_fmac_f32_e32 v30, v57, v56
+; GFX950-NEXT: v_fmac_f32_e32 v12, v59, v58
+; GFX950-NEXT: v_fmac_f32_e32 v35, v61, v60
+; GFX950-NEXT: v_fmac_f32_e32 v17, v0, v16
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v17, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v18, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v19, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v20, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v21, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v22, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v23, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v24, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v25, v9
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v26, v10
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v27, v11
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v38, v36
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v35, v12
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v30, v13
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v29, v14
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v28, v15
+; GFX950-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x8
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60
+; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44
+; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:40
+; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v15
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v10
+; GFX10-NEXT: s_waitcnt vmcnt(8)
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v33
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v32
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GFX10-NEXT: v_fmac_f32_e32 v31, v49, v50
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v14
+; GFX10-NEXT: v_fmac_f32_e32 v15, v51, v32
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v34
+; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v34
+; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28
+; GFX10-NEXT: v_fmac_f32_e32 v32, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v13
+; GFX10-NEXT: v_fmac_f32_e32 v14, v51, v30
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v35
+; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v35
+; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24
+; GFX10-NEXT: v_fmac_f32_e32 v30, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v12
+; GFX10-NEXT: v_fmac_f32_e32 v13, v51, v29
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v36
+; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v36
+; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
+; GFX10-NEXT: v_fmac_f32_e32 v29, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11
+; GFX10-NEXT: v_fmac_f32_e32 v12, v51, v28
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v37
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v37
+; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16
+; GFX10-NEXT: v_fmac_f32_e32 v28, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX10-NEXT: v_fmac_f32_e32 v11, v51, v27
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v38
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v38
+; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v25
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v39
+; GFX10-NEXT: v_fmac_f32_e32 v27, v50, v49
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9
+; GFX10-NEXT: v_fmac_f32_e32 v10, v52, v51
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12
+; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v31
+; GFX10-NEXT: v_fmac_f32_e32 v26, v49, v38
+; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v9
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v24
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v31, v31
+; GFX10-NEXT: v_fmac_f32_e32 v9, v49, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v8
+; GFX10-NEXT: s_waitcnt vmcnt(8)
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v48
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v48
+; GFX10-NEXT: v_fmac_f32_e32 v25, v49, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v7
+; GFX10-NEXT: v_fmac_f32_e32 v48, v8, v24
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v22
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX10-NEXT: s_waitcnt vmcnt(7)
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v33
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX10-NEXT: v_fmac_f32_e32 v8, v49, v39
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_fmac_f32_e32 v33, v7, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v21
+; GFX10-NEXT: s_waitcnt vmcnt(6)
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v34
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v34
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_fmac_f32_e32 v7, v39, v24
+; GFX10-NEXT: v_fmac_f32_e32 v34, v6, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v4
+; GFX10-NEXT: s_waitcnt vmcnt(5)
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v35
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v19
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_fmac_f32_e32 v6, v23, v49
+; GFX10-NEXT: v_fmac_f32_e32 v35, v5, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX10-NEXT: s_waitcnt vmcnt(4)
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v36
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v36
+; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v2
+; GFX10-NEXT: v_fmac_f32_e32 v5, v39, v24
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_fmac_f32_e32 v36, v4, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v16
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v37
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GFX10-NEXT: v_fmac_f32_e32 v39, v23, v22
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v37
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_fmac_f32_e32 v23, v3, v19
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v50
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v51
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v51
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX10-NEXT: v_cmp_u_f32_e64 s5, v33, v33
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v38
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v38
+; GFX10-NEXT: v_fmac_f32_e32 v37, v21, v49
+; GFX10-NEXT: v_fmac_f32_e32 v50, v2, v18
+; GFX10-NEXT: v_fmac_f32_e32 v19, v1, v17
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v48
+; GFX10-NEXT: v_fmac_f32_e32 v38, v0, v16
+; GFX10-NEXT: v_bfe_u32 v0, v48, 16, 1
+; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1
+; GFX10-NEXT: v_bfe_u32 v2, v8, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v33
+; GFX10-NEXT: v_bfe_u32 v18, v7, 16, 1
+; GFX10-NEXT: v_bfe_u32 v21, v34, 16, 1
+; GFX10-NEXT: v_add3_u32 v0, v0, v48, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v48, v35, 16, 1
+; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v33, v5, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v24
+; GFX10-NEXT: v_fmac_f32_e32 v51, v22, v20
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v8
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v7
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v34
+; GFX10-NEXT: v_bfe_u32 v24, v6, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v8, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v8, v8
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v35
+; GFX10-NEXT: v_add3_u32 v18, v18, v7, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s6, v7, v7
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v21, v21, v34, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s7, v34, v34
+; GFX10-NEXT: v_bfe_u32 v34, v39, 16, 1
+; GFX10-NEXT: v_add3_u32 v48, v48, v35, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s9, v35, v35
+; GFX10-NEXT: v_bfe_u32 v35, v23, 16, 1
+; GFX10-NEXT: v_add3_u32 v33, v33, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s10, v5, v5
+; GFX10-NEXT: v_bfe_u32 v5, v37, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v6
+; GFX10-NEXT: v_add3_u32 v24, v24, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v6, v6
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v39
+; GFX10-NEXT: v_add3_u32 v34, v34, v39, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s11, v39, v39
+; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v23
+; GFX10-NEXT: v_add3_u32 v35, v35, v23, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s12, v23, v23
+; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v37
+; GFX10-NEXT: v_add3_u32 v5, v5, v37, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s13, v37, v37
+; GFX10-NEXT: v_bfe_u32 v37, v31, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v53, v2, v4, s4
+; GFX10-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v17, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v17, v18, v20, s6
+; GFX10-NEXT: v_add3_u32 v37, v37, v31, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v21, v22, s7
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
+; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v31, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v15, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v15
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v15, v15
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v24, v49, s8
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v19
+; GFX10-NEXT: v_add3_u32 v37, v37, v15, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v33, v7, s10
+; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
+; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, v6, s11
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v32, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v32
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v32, v32
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v51
+; GFX10-NEXT: v_cndmask_b32_e64 v35, v35, v39, s12
+; GFX10-NEXT: v_add3_u32 v37, v37, v32, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v39, v38, 16, 1
+; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v23, s13
+; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v38
+; GFX10-NEXT: v_cndmask_b32_e64 v32, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v14, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v14
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v14, v14
+; GFX10-NEXT: v_add3_u32 v39, v39, v38, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v50
+; GFX10-NEXT: v_add3_u32 v37, v37, v14, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, v8, s9
+; GFX10-NEXT: v_perm_b32 v15, v15, v31, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v30, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v30
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v30, v30
+; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x7060302
+; GFX10-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v30, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v13, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v13
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v13, v13
+; GFX10-NEXT: v_add3_u32 v37, v37, v13, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v29, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v29
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v29, v29
+; GFX10-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
+; GFX10-NEXT: v_add3_u32 v37, v37, v29, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v12, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v12, v12
+; GFX10-NEXT: v_add3_u32 v37, v37, v12, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v28, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v28
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v28, v28
+; GFX10-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX10-NEXT: v_add3_u32 v37, v37, v28, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v28, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v11, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v11
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v11, v11
+; GFX10-NEXT: v_add3_u32 v37, v37, v11, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v27, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v27
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v27, v27
+; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX10-NEXT: v_add3_u32 v37, v37, v27, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v27, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v10, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v10
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v10, v10
+; GFX10-NEXT: v_add3_u32 v37, v37, v10, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v26, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v26
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v26, v26
+; GFX10-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
+; GFX10-NEXT: v_add3_u32 v37, v37, v26, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v26, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v9, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v9
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
+; GFX10-NEXT: v_add3_u32 v37, v37, v9, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v37, v52, s14
+; GFX10-NEXT: v_bfe_u32 v37, v25, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v25
+; GFX10-NEXT: v_cmp_u_f32_e64 s14, v25, v25
+; GFX10-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
+; GFX10-NEXT: v_add3_u32 v37, v37, v25, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e64 v25, v37, v52, s14
+; GFX10-NEXT: v_cndmask_b32_e32 v52, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_bfe_u32 v1, v50, 16, 1
+; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v36
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_add3_u32 v1, v1, v50, 0x7fff
+; GFX10-NEXT: v_add3_u32 v37, v37, v36, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v24, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX10-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v23, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX10-NEXT: v_perm_b32 v1, v4, v3, 0x7060302
+; GFX10-NEXT: v_perm_b32 v3, v35, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v18, v17, 0x7060302
+; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v0, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v20, v19, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v8, v21, 0x7060302
+; GFX10-NEXT: v_perm_b32 v8, v52, v25, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v22, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v16, v53, 0x7060302
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fma_v32bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: s_clause 0x10
+; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:64
+; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32
+; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:60
+; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:56
+; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:52
+; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:48
+; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:44
+; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:40
+; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:36
+; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:28
+; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:24
+; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:20
+; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:16
+; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:12
+; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:8
+; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v99, 0xffff0000, v21
+; GFX11TRUE16-NEXT: v_and_b32_e32 v100, 0xffff0000, v5
+; GFX11TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v20
+; GFX11TRUE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v4
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11TRUE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v17
+; GFX11TRUE16-NEXT: v_and_b32_e32 v116, 0xffff0000, v1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v22
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11TRUE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v16
+; GFX11TRUE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v103, 0xffff0000, v19
+; GFX11TRUE16-NEXT: v_and_b32_e32 v112, 0xffff0000, v3
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11TRUE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v18
+; GFX11TRUE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v119, 0xffff0000, v31
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v128, 0xffff0000, v32
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v33
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v13
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v131, 0xffff0000, v35
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v37
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v38
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v144, 0xffff0000, v48
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v50
+; GFX11TRUE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v49
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v147, 0xffff0000, v51
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX11TRUE16-NEXT: v_and_b32_e32 v96, 0xffff0000, v7
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v148, 0xffff0000, v55
+; GFX11TRUE16-NEXT: v_and_b32_e32 v87, 0xffff0000, v23
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v25
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v146, v100, v99 :: v_dual_lshlrev_b32 v25, 16, v25
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GFX11TRUE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v9
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v48, v7, v23
+; GFX11TRUE16-NEXT: v_and_b32_e32 v135, 0xffff0000, v39
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v49, v6, v22
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v134, v84, v83 :: v_dual_lshlrev_b32 v13, 16, v13
+; GFX11TRUE16-NEXT: v_bfe_u32 v83, v146, 16, 1
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v51, v4, v20 :: v_dual_fmac_f32 v148, v118, v117
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v144, v96, v87 :: v_dual_and_b32 v81, 0xffff0000, v26
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v55, v0, v16 :: v_dual_lshlrev_b32 v26, 16, v26
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v145, v98, v97
+; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v146
+; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v146, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v8
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v10
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v147, v102, v101 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v28
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v37, v10, v26 :: v_dual_lshlrev_b32 v28, 16, v28
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v39, v8, v24
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v133, v82, v81 :: v_dual_and_b32 v70, 0xffff0000, v12
+; GFX11TRUE16-NEXT: v_bfe_u32 v97, v51, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v37, 16, 1
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v135, v86, v85 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v11
+; GFX11TRUE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v36
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v133
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v37
+; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v51
+; GFX11TRUE16-NEXT: v_add3_u32 v23, v23, v37, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v27
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v51, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v15
+; GFX11TRUE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v34
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v35, v12, v28 :: v_dual_lshlrev_b32 v34, 16, v34
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v36, v11, v27
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v50, v5, v21
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v132, v80, v71 :: v_dual_and_b32 v67, 0xffff0000, v29
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v130, v68, v67 :: v_dual_and_b32 v65, 0xffff0000, v30
+; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v36
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v34, v13, v29 :: v_dual_fmac_f32 v31, v15, v32
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v119, v64, v128 :: v_dual_and_b32 v66, 0xffff0000, v14
+; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v52
+; GFX11TRUE16-NEXT: v_and_b32_e32 v128, 0xffff0000, v53
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v129, v66, v65 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v54
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v64, v112, v103
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v38, v9, v25
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v131, v70, v69 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v53, v2, v18
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v119, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v31, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v33, v14, v30 :: v_dual_fmac_f32 v52, v3, v19
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v54, v1, v17
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v119
+; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v31
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v129, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v119, 0x7fff
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v119, v119
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v31, 0x7fff
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e64 s0, v31, v31
+; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v129
+; GFX11TRUE16-NEXT: v_bfe_u32 v6, v33, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v14, v132, 16, 1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v0, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v149, v2, v3, s0
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v4, v129, 0x7fff
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v33
+; GFX11TRUE16-NEXT: v_bfe_u32 v8, v130, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v6, v33, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v150, v14, v132, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v2, v5, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v130
+; GFX11TRUE16-NEXT: v_bfe_u32 v10, v34, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v13, v35, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v8, v130, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v33, v3, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v34
+; GFX11TRUE16-NEXT: v_bfe_u32 v12, v131, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v6, v10, v34, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v35, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v4, v9, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v131
+; GFX11TRUE16-NEXT: v_add3_u32 v8, v12, v131, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v35
+; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v132
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v34, v6, v11, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v131, v131
+; GFX11TRUE16-NEXT: v_bfe_u32 v19, v36, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v21, v133, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v25, v134, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v134
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v8, v16, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v36, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v21, v21, v133, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v27, v38, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v25, v25, v134, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v10, v17, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132
+; GFX11TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v38
+; GFX11TRUE16-NEXT: v_bfe_u32 v29, v135, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v27, v27, v38, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v135
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v150, v18, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11TRUE16-NEXT: v_bfe_u32 v65, v39, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v29, v29, v135, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v39
+; GFX11TRUE16-NEXT: v_bfe_u32 v67, v144, 16, 1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133
+; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v39, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v144
+; GFX11TRUE16-NEXT: v_bfe_u32 v69, v48, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v144, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v21, v22, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v48
+; GFX11TRUE16-NEXT: v_bfe_u32 v71, v145, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v48, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v145
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v23, v24, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX11TRUE16-NEXT: v_bfe_u32 v81, v49, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v145, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v49
+; GFX11TRUE16-NEXT: v_bfe_u32 v85, v50, 16, 1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v25, v26, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v49, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v50
+; GFX11TRUE16-NEXT: v_bfe_u32 v87, v147, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v50, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v27, v28, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135
+; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v147
+; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v147, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v99, v64, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v64
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v29, v30, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11TRUE16-NEXT: v_bfe_u32 v101, v52, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v64, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v52
+; GFX11TRUE16-NEXT: v_bfe_u32 v117, v54, 16, 1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v65, v66, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144
+; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v52, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v54
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v55, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v54, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v67, v68, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v55
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v55, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v119, v148, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v148
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v69, v70, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v20.h
+; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v148, 0x7fff
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v19.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v71, v80, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v18.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v17.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v16.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v34.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v81, v82, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v33.h
+; GFX11TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v22.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v83, v84, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v85, v86, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v23.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v87, v96, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v128, v114, v113
+; GFX11TRUE16-NEXT: v_bfe_u32 v113, v53, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v53
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v97, v98, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11TRUE16-NEXT: v_bfe_u32 v103, v128, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v128
+; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v53, 0x7fff
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v99, v100, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v128, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v101, v102, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v128, v128
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v32, v116, v115
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v103, v112, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11TRUE16-NEXT: v_bfe_u32 v115, v32, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v32
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v113, v114, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v32, 0x7fff
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v117, v118, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v0, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v115, v116, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v148, v148
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v27.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v119, v31, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v28.h
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fma_v32bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: s_clause 0x10
+; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:64
+; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32
+; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:60
+; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:56
+; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:52
+; GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:48
+; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:44
+; GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:40
+; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:36
+; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:32
+; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:28
+; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:24
+; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:20
+; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:16
+; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:12
+; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:8
+; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:4
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v21
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v5
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v97, 16, v22
+; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v101, 16, v20
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v4
+; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v117, 16, v16
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v118, 16, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v23
+; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v6
+; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v103, 16, v19
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v24
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v113, 16, v18
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v17
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(15)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v32
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v129, 16, v33
+; GFX11FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v13
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v35
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v37
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(9)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v38
+; GFX11FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v37
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v144, 16, v48
+; GFX11FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v48
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v50
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v145, 16, v49
+; GFX11FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v49
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v51
+; GFX11FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v51
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v7
+; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v25
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v146, v100, v99 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v135, 16, v39
+; GFX11FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v39
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v48, v7, v23 :: v_dual_fmac_f32 v49, v6, v22
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v134, v84, v83 :: v_dual_and_b32 v13, 0xffff0000, v13
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v51, v4, v20
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v144, v96, v87 :: v_dual_lshlrev_b32 v81, 16, v26
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v145, v98, v97 :: v_dual_and_b32 v26, 0xffff0000, v26
+; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v146
+; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v8
+; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v10
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v147, v102, v101 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v38
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v28
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v37, v10, v26 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v39, v8, v24
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v133, v82, v81 :: v_dual_lshlrev_b32 v70, 16, v12
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v135, v86, v85 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v35
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v11
+; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v132, 16, v36
+; GFX11FAKE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v36
+; GFX11FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50
+; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v133
+; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v37
+; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v27
+; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v34
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v35, v12, v28 :: v_dual_and_b32 v34, 0xffff0000, v34
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v36, v11, v27
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v50, v5, v21
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v132, v80, v71 :: v_dual_lshlrev_b32 v67, 16, v29
+; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v51
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v31
+; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v15
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v34, v13, v29 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v130, v68, v67 :: v_dual_lshlrev_b32 v65, 16, v30
+; GFX11FAKE16-NEXT: v_bfe_u32 v23, v37, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v31, v15, v32 :: v_dual_lshlrev_b32 v66, 16, v14
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v119, v64, v128
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(3)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v52
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v53
+; GFX11FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v53
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v54
+; GFX11FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v54
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55
+; GFX11FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v55
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v129, v66, v65 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11FAKE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v52
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v64, v112, v103
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v38, v9, v25
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v131, v70, v69 :: v_dual_and_b32 v14, 0xffff0000, v14
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v53, v2, v18
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v55, v0, v16
+; GFX11FAKE16-NEXT: v_bfe_u32 v0, v119, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v31, 16, 1
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v33, v14, v30 :: v_dual_fmac_f32 v52, v3, v19
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v32, v1, v17
+; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v119
+; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v31
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v129, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v119, 0x7fff
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v119, v119
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v31, 0x7fff
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e64 s0, v31, v31
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v128, v114, v113
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v54, v118, v117
+; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v129
+; GFX11FAKE16-NEXT: v_bfe_u32 v6, v33, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v10, v34, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v14, v35, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v19, v36, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v27, v38, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v65, v39, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v69, v48, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v81, v49, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v85, v50, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v97, v51, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v101, v52, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v113, v53, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v117, v32, 16, 1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v148, v0, v1, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v149, v2, v3, s0
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v4, v129, 0x7fff
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v15, v116, v115
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v33
+; GFX11FAKE16-NEXT: v_bfe_u32 v8, v130, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v6, v33, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v6, v10, v34, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v10, v14, v35, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v14, v19, v36, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v19, v23, v37, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v23, v27, v38, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v27, v65, v39, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v65, v69, v48, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v69, v81, v49, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v81, v85, v50, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v85, v97, v51, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v97, v101, v52, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v101, v113, v53, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v113, v117, v32, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v117, v2, v5, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v130
+; GFX11FAKE16-NEXT: v_bfe_u32 v12, v131, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v17, v132, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v21, v133, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v134, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v29, v135, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v67, v144, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v71, v145, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v83, v146, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v87, v147, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v99, v64, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v103, v128, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v115, v15, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v119, v54, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v8, v130, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v33, v3, v7, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130
+; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v34
+; GFX11FAKE16-NEXT: v_add3_u32 v8, v12, v131, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v12, v17, v132, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v17, v21, v133, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v21, v25, v134, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v25, v29, v135, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v29, v67, v144, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v67, v71, v145, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v71, v83, v146, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v83, v87, v147, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v87, v99, v64, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v99, v103, v128, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v103, v115, v15, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v115, v119, v54, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v119, v4, v9, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v131
+; GFX11FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v35
+; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v132
+; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v36
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v34, v6, v11, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v131, v131
+; GFX11FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v134
+; GFX11FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v38
+; GFX11FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v135
+; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v39
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v144
+; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v48
+; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v145
+; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v49
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v10, v16, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132
+; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v50
+; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v147
+; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v64
+; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v52
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v18, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v128
+; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v15
+; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v32
+; GFX11FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v54
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v20, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133
+; GFX11FAKE16-NEXT: v_bfe_u32 v0, v55, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v55
+; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v53
+; GFX11FAKE16-NEXT: v_perm_b32 v11, v12, v11, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v17, v22, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v55, 0x7fff
+; GFX11FAKE16-NEXT: v_perm_b32 v12, v16, v13, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v13, v34, v119, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v19, v24, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v10, v14, v10, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v26, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11FAKE16-NEXT: v_perm_b32 v14, v33, v117, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v28, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v9, v17, v9, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v25, v30, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v27, v66, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144
+; GFX11FAKE16-NEXT: v_perm_b32 v8, v18, v8, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v29, v68, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v65, v70, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v67, v80, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v69, v82, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146
+; GFX11FAKE16-NEXT: v_perm_b32 v6, v20, v6, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v71, v84, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v81, v86, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v5, v21, v5, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v83, v96, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v87, v100, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v97, v102, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v128, v128
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v99, v112, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v103, v116, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v113, v118, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v115, v31, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11FAKE16-NEXT: v_perm_b32 v1, v23, v15, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v15, v149, v148, 0x7060302
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v24, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v101, v114, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v85, v98, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_perm_b32 v4, v26, v4, 0x7060302
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1250-LABEL: v_fma_v32bf16:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_clause 0x10
-; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64
-; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12
-; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16
-; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20
-; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24
-; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28
-; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32
-; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36
-; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40
-; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44
-; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48
-; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52
-; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56
-; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60
-; GFX1250-NEXT: scratch_load_b32 v55, off, s32
-; GFX1250-NEXT: s_wait_loadcnt 0xf
-; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32
-; GFX1250-NEXT: s_wait_loadcnt 0xe
-; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33
-; GFX1250-NEXT: s_wait_loadcnt 0xd
-; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34
-; GFX1250-NEXT: s_wait_loadcnt 0xc
-; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35
-; GFX1250-NEXT: s_wait_loadcnt 0xb
-; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36
-; GFX1250-NEXT: s_wait_loadcnt 0xa
-; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37
-; GFX1250-NEXT: s_wait_loadcnt 0x9
-; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38
-; GFX1250-NEXT: s_wait_loadcnt 0x8
-; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39
-; GFX1250-NEXT: s_wait_loadcnt 0x7
-; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48
-; GFX1250-NEXT: s_wait_loadcnt 0x6
-; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49
-; GFX1250-NEXT: s_wait_loadcnt 0x5
-; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50
-; GFX1250-NEXT: s_wait_loadcnt 0x4
-; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51
-; GFX1250-NEXT: s_wait_loadcnt 0x3
-; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52
-; GFX1250-NEXT: s_wait_loadcnt 0x2
-; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53
-; GFX1250-NEXT: s_wait_loadcnt 0x1
-; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
-define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) {
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x10
+; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64
+; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16
+; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20
+; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24
+; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28
+; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32
+; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36
+; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40
+; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48
+; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52
+; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56
+; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60
+; GFX1250-NEXT: scratch_load_b32 v55, off, s32
+; GFX1250-NEXT: s_wait_loadcnt 0xf
+; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32
+; GFX1250-NEXT: s_wait_loadcnt 0xe
+; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33
+; GFX1250-NEXT: s_wait_loadcnt 0xd
+; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34
+; GFX1250-NEXT: s_wait_loadcnt 0xc
+; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35
+; GFX1250-NEXT: s_wait_loadcnt 0xb
+; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36
+; GFX1250-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37
+; GFX1250-NEXT: s_wait_loadcnt 0x9
+; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38
+; GFX1250-NEXT: s_wait_loadcnt 0x8
+; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39
+; GFX1250-NEXT: s_wait_loadcnt 0x7
+; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48
+; GFX1250-NEXT: s_wait_loadcnt 0x6
+; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49
+; GFX1250-NEXT: s_wait_loadcnt 0x5
+; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50
+; GFX1250-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51
+; GFX1250-NEXT: s_wait_loadcnt 0x3
+; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52
+; GFX1250-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53
+; GFX1250-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c)
ret <32 x bfloat> %op
}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index 12e9888..aaea4f7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -5015,7 +5015,7 @@ define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out
%a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
%a = load volatile double, ptr addrspace(1) %a.gep
- %fneg.a = fsub double -0.000000e+00, %a
+ %fneg.a = fsub nsz double -0.000000e+00, %a
%fpround = fptrunc double %fneg.a to float
%fneg = fneg float %fpround
store float %fneg, ptr addrspace(1) %out.gep
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index c4ca79d..3de6df2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -4441,25 +4441,40 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) {
ret float %i3
}
-define float @v_fmul_0_fsub_0_infloop_regression(float %arg) {
-; GCN-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression:
+define float @v_fmul_0_fsub_0_safe_infloop_regression(float %arg) {
+; GCN-SAFE-LABEL: v_fmul_0_fsub_0_safe_infloop_regression:
; GCN-SAFE: ; %bb.0: ; %bb
; GCN-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0
; GCN-SAFE-NEXT: v_sub_f32_e32 v0, 0, v0
; GCN-SAFE-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-NSZ-LABEL: v_fmul_0_fsub_0_infloop_regression:
-; GCN-NSZ: ; %bb.0: ; %bb
-; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
-; GCN-NSZ-NEXT: s_setpc_b64 s[30:31]
+; SI-NSZ-LABEL: v_fmul_0_fsub_0_safe_infloop_regression:
+; SI-NSZ: ; %bb.0: ; %bb
+; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NSZ-NEXT: s_brev_b32 s4, 1
+; SI-NSZ-NEXT: v_fma_f32 v0, v0, s4, 0
+; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
+; FIXME: utils/update_llc_test_checks.py will generate redundant VI
+; labels; remove them, as they will cause test failures.
bb:
%i = fmul float %arg, 0.0
%i1 = fsub float 0.0, %i
ret float %i1
}
+define float @v_fmul_0_fsub_0_nsz_infloop_regression(float %arg) {
+; GCN-LABEL: v_fmul_0_fsub_0_nsz_infloop_regression:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %i = fmul float %arg, 0.0
+ %i1 = fsub nsz float 0.0, %i
+ ret float %i1
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll
new file mode 100644
index 0000000..d6198f5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll
@@ -0,0 +1,9 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
+
+; CHECK: @arbitrary
+declare amdgpu_kernel void @arbitrary(ptr addrspace(1))
+
+; COM: This used to cause verifier errors when "lowered"
+declare <4 x i8> @llvm.masked.load.v4i8.p7(ptr addrspace(7) captures(none), i32 immarg, <4 x i1>, <4 x i8>)
+; CHECK-NOT: llvm.masked.load
diff --git a/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll
new file mode 100644
index 0000000..b508f73
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Goal: With a loop-header PHI of an illegal vector type and a same-BB
+; non-lookthrough user (vector add) in the header, LRO should still coerce
+; the PHI to i32 because a profitable sink (store) exists in another block.
+
+define amdgpu_kernel void @phi_samebb_nonlookthrough_store(
+ ptr addrspace(1) %out, <4 x i8> %v, i1 %exit) {
+; CHECK-LABEL: @phi_samebb_nonlookthrough_store(
+entry:
+ br label %loop
+
+loop: ; preds = %entry, %loop
+ ; Loop-carried PHI of an illegal vector type.
+ %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+ ; Same-BB non-lookthrough use in header.
+ %acc.next = add <4 x i8> %acc, %v
+
+ ; Make it a real loop: either iterate or exit to the sink block.
+ br i1 %exit, label %store, label %loop
+
+store: ; preds = %loop
+ ; The cross-block sink: storing the PHI coerced to i32.
+ %acc.bc = bitcast <4 x i8> %acc to i32
+ store i32 %acc.bc, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; After AMDGPULateCodeGenPrepare we expect:
+; - PHI is coerced to i32
+; - A header bitcast materializes for the add
+; This proves the same-BB non-lookthrough user (add) did not get pruned
+; when the def is a PHI.
+
+; CHECK: loop:
+; CHECK: %[[ACC_TC:[^ ]+]] = phi i32
+; CHECK: %[[ACC_TC_BC:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8>
+; CHECK: %[[ACC_NEXT:[^ ]+]] = add <4 x i8> %[[ACC_TC_BC]], %v
+; CHECK: br i1 %exit, label %store, label %loop
+; CHECK: store:
+; CHECK: %[[ACC_TC_BC2:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8>
+; CHECK: %[[ST_I32:[^ ]+]] = bitcast <4 x i8> %[[ACC_TC_BC2]] to i32
+; CHECK: store i32 %[[ST_I32]],
+
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 92d3277..bb22144 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -4148,28 +4148,28 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
; --------------------------------------------------------------------------------
define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
-; CI-SAFE-LABEL: select_fneg_posk_src_add_v2f16:
-; CI-SAFE: ; %bb.0:
-; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-SAFE-NEXT: v_add_f32_e32 v3, 4.0, v3
-; CI-SAFE-NEXT: v_add_f32_e32 v2, 4.0, v2
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc
-; CI-SAFE-NEXT: s_setpc_b64 s[30:31]
+; CI-LABEL: select_fneg_posk_src_add_v2f16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_add_f32_e32 v3, 4.0, v3
+; CI-NEXT: v_add_f32_e32 v2, 4.0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc
+; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16:
; VI-SAFE: ; %bb.0:
@@ -4229,21 +4229,6 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
-; CI-NSZ: ; %bb.0:
-; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NSZ-NEXT: v_sub_f32_e32 v2, -4.0, v2
-; CI-NSZ-NEXT: v_sub_f32_e32 v3, -4.0, v3
-; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc
-; CI-NSZ-NEXT: s_setpc_b64 s[30:31]
-;
; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
; VI-NSZ: ; %bb.0:
; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4302,6 +4287,105 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
ret <2 x half> %select
}
+define <2 x half> @select_fneg_posk_src_add_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %y) {
+; CI-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2
+; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3
+; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NEXT: v_mov_b32_e32 v1, 0xc400
+; VI-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_sub_f16_e32 v2, -4.0, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x4000
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq <2 x i32> %c, zeroinitializer
+ %add = fadd nsz <2 x half> %x, <half 4.0, half 4.0>
+ %fneg = fneg <2 x half> %add
+ %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> <half 2.0, half 2.0>
+ ret <2 x half> %select
+}
+
define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
; CI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16:
; CI-SAFE: ; %bb.0:
@@ -4704,34 +4788,34 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
}
define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %z) {
-; CI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
-; CI-SAFE: ; %bb.0:
-; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-SAFE-NEXT: v_mul_f32_e32 v3, 4.0, v3
-; CI-SAFE-NEXT: v_add_f32_e32 v3, v3, v5
-; CI-SAFE-NEXT: v_mul_f32_e32 v2, 4.0, v2
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-SAFE-NEXT: v_add_f32_e32 v2, v2, v4
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc
-; CI-SAFE-NEXT: s_setpc_b64 s[30:31]
+; CI-LABEL: select_fneg_posk_src_fmad_v2f16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_mul_f32_e32 v3, 4.0, v3
+; CI-NEXT: v_add_f32_e32 v3, v3, v5
+; CI-NEXT: v_mul_f32_e32 v2, 4.0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_add_f32_e32 v2, v2, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
+; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc
+; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
; VI-SAFE: ; %bb.0:
@@ -4793,27 +4877,6 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
-; CI-NSZ: ; %bb.0:
-; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NSZ-NEXT: v_mul_f32_e32 v2, -4.0, v2
-; CI-NSZ-NEXT: v_mul_f32_e32 v3, -4.0, v3
-; CI-NSZ-NEXT: v_sub_f32_e32 v2, v2, v4
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NSZ-NEXT: v_sub_f32_e32 v3, v3, v5
-; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc
-; CI-NSZ-NEXT: s_setpc_b64 s[30:31]
-;
; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
; VI-NSZ: ; %bb.0:
; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4873,6 +4936,112 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
ret <2 x half> %select
}
+define <2 x half> @select_fneg_posk_src_fmad_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %z) {
+; CI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2
+; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3
+; CI-NEXT: v_sub_f32_e32 v2, v2, v4
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_sub_f32_e32 v3, v3, v5
+; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; VI-NEXT: v_fma_f16 v1, v4, -4.0, -v1
+; VI-NEXT: v_fma_f16 v2, v2, -4.0, -v3
+; VI-NEXT: v_mov_b32_e32 v3, 0x4000
+; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: select_fneg_posk_src_fmad_v2f16_nsz:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq <2 x i32> %c, zeroinitializer
+ %fmad = call nsz <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> <half 4.0, half 4.0>, <2 x half> %z)
+ %fneg = fneg <2 x half> %fmad
+ %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> <half 2.0, half 2.0>
+ ret <2 x half> %select
+}
+
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll
index c128715..f5dc824 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll
@@ -116,7 +116,7 @@ entry:
; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
%b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1
%c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2
@@ -125,7 +125,7 @@ entry:
%b = load float, ptr addrspace(1) %b_ptr
%c = load float, ptr addrspace(1) %c_ptr
- %neg_a = fsub float 0.0, %a
+ %neg_a = fsub nsz float 0.0, %a
%tmp0 = fmul float %neg_a, %b
%tmp1 = fadd float %tmp0, %c
@@ -176,7 +176,7 @@ entry:
; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
; GCN-NOT: v_mac_f32
; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
entry:
%b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1
%c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2
@@ -185,7 +185,7 @@ entry:
%b = load float, ptr addrspace(1) %b_ptr
%c = load float, ptr addrspace(1) %c_ptr
- %neg_b = fsub float 0.0, %b
+ %neg_b = fsub nsz float 0.0, %b
%tmp0 = fmul float %a, %neg_b
%tmp1 = fadd float %tmp0, %c
@@ -310,6 +310,5 @@ define float @v_mac_f32_dynamic_ftz(float %a, float %b, float %c) "denormal-fp-m
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
-attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index bcc60b0..8da6f23 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -236,7 +236,7 @@ entry:
%b.val = load half, ptr addrspace(1) %b
%c.val = load half, ptr addrspace(1) %c
- %a.neg = fsub half 0.0, %a.val
+ %a.neg = fsub nsz half 0.0, %a.val
%t.val = fmul half %a.neg, %b.val
%r.val = fadd half %t.val, %c.val
@@ -263,7 +263,7 @@ entry:
%b.val = load half, ptr addrspace(1) %b
%c.val = load half, ptr addrspace(1) %c
- %b.neg = fsub half 0.0, %b.val
+ %b.neg = fsub nsz half 0.0, %b.val
%t.val = fmul half %a.val, %b.neg
%r.val = fadd half %t.val, %c.val
@@ -290,7 +290,7 @@ entry:
%b.val = load half, ptr addrspace(1) %b
%c.val = load half, ptr addrspace(1) %c
- %c.neg = fsub half 0.0, %c.val
+ %c.neg = fsub nsz half 0.0, %c.val
%t.val = fmul half %a.val, %b.val
%r.val = fadd half %t.val, %c.neg
@@ -601,7 +601,7 @@ entry:
%b.val = load <2 x half>, ptr addrspace(1) %b
%c.val = load <2 x half>, ptr addrspace(1) %c
- %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
+ %a.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %a.val
%t.val = fmul <2 x half> %a.neg, %b.val
%r.val = fadd <2 x half> %t.val, %c.val
@@ -634,7 +634,7 @@ entry:
%b.val = load <2 x half>, ptr addrspace(1) %b
%c.val = load <2 x half>, ptr addrspace(1) %c
- %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
+ %b.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %b.val
%t.val = fmul <2 x half> %a.val, %b.neg
%r.val = fadd <2 x half> %t.val, %c.val
@@ -667,7 +667,7 @@ entry:
%b.val = load <2 x half>, ptr addrspace(1) %b
%c.val = load <2 x half>, ptr addrspace(1) %c
- %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
+ %c.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %c.val
%t.val = fmul <2 x half> %a.val, %b.val
%r.val = fadd <2 x half> %t.val, %c.neg
@@ -678,5 +678,5 @@ entry:
declare void @llvm.amdgcn.s.barrier() #2
attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" }
-attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #2 = { nounwind convergent }
diff --git a/llvm/test/CodeGen/ARM/build-attributes.ll b/llvm/test/CodeGen/ARM/build-attributes.ll
index 68844ae..306a4a3 100644
--- a/llvm/test/CodeGen/ARM/build-attributes.ll
+++ b/llvm/test/CodeGen/ARM/build-attributes.ll
@@ -3,23 +3,16 @@
; RUN: llc < %s -mtriple=thumbv5-linux-gnueabi -mcpu=xscale -mattr=+strict-align | FileCheck %s --check-prefix=XSCALE
; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6-FAST
; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6M
-; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6M-FAST
; RUN: llc < %s -mtriple=thumbv6sm-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6M
-; RUN: llc < %s -mtriple=thumbv6sm-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6M-FAST
; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align | FileCheck %s --check-prefix=ARM1156T2F-S
-; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=ARM1156T2F-S-FAST
; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi | FileCheck %s --check-prefix=V7M
-; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V7M-FAST
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=V7
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V7-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V8-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi | FileCheck %s --check-prefix=Vt8
; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
@@ -31,35 +24,24 @@
; RUN: llc < %s -mtriple=thumbv8m.main-linux-gnueabi | FileCheck %s --check-prefix=V8MMAINLINE
; RUN: llc < %s -mtriple=thumbv8m.main-linux-gnueabi -mattr=+dsp | FileCheck %s --check-prefix=V8MMAINLINE_DSP
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-neon,-d32 | FileCheck %s --check-prefix=CORTEX-A5-NONEON
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A5-NOFPU
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-NOFPU-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A8-SOFT
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A8-SOFT-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A8-HARD
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A8-HARD-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A8-SOFT
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-SOFT-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-HARD-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A12-NOFPU
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A12-NOFPU-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A15-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 | FileCheck %s --check-prefix=CORTEX-A17-DEFAULT
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A17-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A17-NOFPU
-; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A17-NOFPU-FAST
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-no-trapping-fp-math | FileCheck %s --check-prefix=NO-TRAPPING-MATH
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -denormal-fp-math=ieee | FileCheck %s --check-prefix=DENORMAL-IEEE
@@ -74,37 +56,26 @@
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0
-; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M0-FAST
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus | FileCheck %s --check-prefix=CORTEX-M0PLUS
-; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M0PLUS-FAST
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 | FileCheck %s --check-prefix=CORTEX-M1
-; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M1-FAST
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align | FileCheck %s --check-prefix=SC000
-; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=SC000-FAST
; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=CORTEX-M3
-; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M3-FAST
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 | FileCheck %s --check-prefix=SC300
-; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=SC300-FAST
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT
-; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M4-SOFT-FAST
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD
-; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M4-HARD-FAST
; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SOFT
-; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-NOFPU-FAST
; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-fp64 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SINGLE
-; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-fp64 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-FAST
; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7-DOUBLE
; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m23 | FileCheck %s --check-prefix=CORTEX-M23
; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=CORTEX-M33
-; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M33-FAST
; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m35p | FileCheck %s --check-prefix=CORTEX-M35P
@@ -113,49 +84,34 @@
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4 | FileCheck %s --check-prefix=CORTEX-R4
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4f | FileCheck %s --check-prefix=CORTEX-R4F
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
-; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R5-FAST
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 | FileCheck %s --check-prefix=CORTEX-R7
-; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R7-FAST
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 | FileCheck %s --check-prefix=CORTEX-R8
-; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R8-FAST
; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 | FileCheck %s --check-prefix=CORTEX-A32
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A32-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=CORTEX-A35
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A35-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A53-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A57-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=CORTEX-A72
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A72-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a73 | FileCheck %s --check-prefix=CORTEX-A73
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m3 | FileCheck %s --check-prefix=EXYNOS-M3
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m3 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m3 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 | FileCheck %s --check-prefix=EXYNOS-M4
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 | FileCheck %s --check-prefix=EXYNOS-M5
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST
; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
-; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A-FAST
; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=CORTEX-A7-CHECK
-; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-CHECK-FAST
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2sp,-vfp3,-vfp4,-neon,-fp16 | FileCheck %s --check-prefix=CORTEX-A7-NOFPU
-; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2sp,-vfp3,-vfp4,-neon,-fp16 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-NOFPU-FAST
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
-; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-FPUV4-FAST
; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-d32,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4
; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=pic | FileCheck %s --check-prefix=RELOC-PIC
; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=static | FileCheck %s --check-prefix=RELOC-OTHER
@@ -278,15 +234,6 @@
; V6-NOT: .eabi_attribute 28
; V6: .eabi_attribute 38, 1
-; V6-FAST-NOT: .eabi_attribute 19
-;; Despite the V6 CPU having no FPU by default, we chose to flush to
-;; positive zero here. There's no hardware support doing this, but the
-;; fast maths software library might.
-; V6-FAST-NOT: .eabi_attribute 20
-; V6-FAST-NOT: .eabi_attribute 21
-; V6-FAST-NOT: .eabi_attribute 22
-; V6-FAST: .eabi_attribute 23, 1
-
;; We emit 6, 12 for both v6-M and v6S-M; technically this is incorrect for
;; V6-M, however we don't model the OS extension so this is fine.
; V6M: .eabi_attribute 6, 12
@@ -312,14 +259,6 @@
; V6M-NOT: .eabi_attribute 28
; V6M: .eabi_attribute 38, 1
-; V6M-FAST-NOT: .eabi_attribute 19
-;; Despite the V6M CPU having no FPU by default, we chose to flush to
-;; positive zero here. There's no hardware support doing this, but the
-;; fast maths software library might.
-; V6M-FAST-NOT: .eabi_attribute 20
-; V6M-FAST-NOT: .eabi_attribute 21
-; V6M-FAST-NOT: .eabi_attribute 22
-; V6M-FAST: .eabi_attribute 23, 1
; ARM1156T2F-S: .cpu arm1156t2f-s
; ARM1156T2F-S: .eabi_attribute 6, 8
@@ -342,14 +281,6 @@
; ARM1156T2F-S-NOT: .eabi_attribute 28
; ARM1156T2F-S: .eabi_attribute 38, 1
-; ARM1156T2F-S-FAST-NOT: .eabi_attribute 19
-;; V6 cores default to flush to positive zero (value 0). Note that value 2 is equally
-;; valid for this core; it is an implementation-defined question as to which of 0 and 2 you
-;; select. LLVM historically picks 0.
-; ARM1156T2F-S-FAST-NOT: .eabi_attribute 20
-; ARM1156T2F-S-FAST-NOT: .eabi_attribute 21
-; ARM1156T2F-S-FAST-NOT: .eabi_attribute 22
-; ARM1156T2F-S-FAST: .eabi_attribute 23, 1
; V7M: .eabi_attribute 6, 10
; V7M: .eabi_attribute 7, 77
@@ -374,15 +305,6 @@
; V7M-NOT: .eabi_attribute 28
; V7M: .eabi_attribute 38, 1
-; V7M-FAST-NOT: .eabi_attribute 19
-;; Despite the V7M CPU having no FPU by default, we chose to flush
-;; preserving sign. This matches what the hardware would do if the
-;; architecture revision were to exist on the current target.
-; V7M-FAST: .eabi_attribute 20, 2
-; V7M-FAST-NOT: .eabi_attribute 21
-; V7M-FAST-NOT: .eabi_attribute 22
-; V7M-FAST: .eabi_attribute 23, 1
-
; V7: .syntax unified
; V7: .eabi_attribute 6, 10
; V7-NOT: .eabi_attribute 27
@@ -401,13 +323,6 @@
; V7-NOT: .eabi_attribute 28
; V7: .eabi_attribute 38, 1
-; V7-FAST-NOT: .eabi_attribute 19
-;; The default CPU does have an FPU and it must be VFPv3 or better, so it flushes
-;; denormals to zero preserving the sign.
-; V7-FAST: .eabi_attribute 20, 2
-; V7-FAST-NOT: .eabi_attribute 21
-; V7-FAST-NOT: .eabi_attribute 22
-; V7-FAST: .eabi_attribute 23, 1
; V7VE: .syntax unified
; V7VE: .eabi_attribute 6, 10 @ Tag_CPU_arch
@@ -435,12 +350,6 @@
; V8-NOT: .eabi_attribute 22
; V8: .eabi_attribute 23, 3
-; V8-FAST-NOT: .eabi_attribute 19
-;; The default does have an FPU, and for V8-A, it flushes preserving sign.
-; V8-FAST: .eabi_attribute 20, 2
-; V8-FAST-NOT: .eabi_attribute 21
-; V8-FAST-NOT: .eabi_attribute 22
-; V8-FAST: .eabi_attribute 23, 1
; Vt8: .syntax unified
; Vt8: .eabi_attribute 6, 14
@@ -552,15 +461,11 @@
;; We default to IEEE 754 compliance
; CORTEX-A7-CHECK: .eabi_attribute 20, 1
;; The A7 has VFPv3 support by default, so flush preserving sign.
-; CORTEX-A7-CHECK-FAST: .eabi_attribute 20, 2
; CORTEX-A7-NOFPU: .eabi_attribute 20, 1
;; Despite there being no FPU, we chose to flush to zero preserving
;; sign. This matches what the hardware would do for this architecture
;; revision.
-; CORTEX-A7-NOFPU-FAST: .eabi_attribute 20, 2
; CORTEX-A7-FPUV4: .eabi_attribute 20, 1
-;; The VFPv4 FPU flushes preserving sign.
-; CORTEX-A7-FPUV4-FAST: .eabi_attribute 20, 2
; Tag_ABI_FP_exceptions
; CORTEX-A7-CHECK: .eabi_attribute 21, 1
@@ -610,13 +515,6 @@
; CORTEX-A5-DEFAULT: .eabi_attribute 24, 1
; CORTEX-A5-DEFAULT: .eabi_attribute 25, 1
-; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 19
-;; The A5 defaults to a VFPv4 FPU, so it flushes preserving the sign when -ffast-math
-;; is given.
-; CORTEX-A5-DEFAULT-FAST: .eabi_attribute 20, 2
-; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 21
-; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 22
-; CORTEX-A5-DEFAULT-FAST: .eabi_attribute 23, 1
; CORTEX-A5-NONEON: .cpu cortex-a5
; CORTEX-A5-NONEON: .eabi_attribute 6, 10
@@ -634,13 +532,6 @@
; CORTEX-A5-NONEON: .eabi_attribute 24, 1
; CORTEX-A5-NONEON: .eabi_attribute 25, 1
-; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 19
-;; The A5 defaults to a VFPv4 FPU, so it flushes preserving sign when -ffast-math
-;; is given.
-; CORTEX-A5-NONEON-FAST: .eabi_attribute 20, 2
-; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 21
-; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 22
-; CORTEX-A5-NONEON-FAST: .eabi_attribute 23, 1
; CORTEX-A5-NOFPU: .cpu cortex-a5
; CORTEX-A5-NOFPU: .eabi_attribute 6, 10
@@ -659,14 +550,9 @@
; CORTEX-A5-NOFPU: .eabi_attribute 24, 1
; CORTEX-A5-NOFPU: .eabi_attribute 25, 1
-; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 19
;; Despite there being no FPU, we chose to flush to zero preserving
;; sign. This matches what the hardware would do for this architecture
;; revision.
-; CORTEX-A5-NOFPU-FAST: .eabi_attribute 20, 2
-; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 21
-; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 22
-; CORTEX-A5-NOFPU-FAST: .eabi_attribute 23, 1
; CORTEX-A8-SOFT: .cpu cortex-a8
; CORTEX-A8-SOFT: .eabi_attribute 6, 10
@@ -712,15 +598,6 @@
; CORTEX-A9-SOFT-NOT: .eabi_attribute 28
; CORTEX-A9-SOFT: .eabi_attribute 38, 1
-; CORTEX-A8-SOFT-FAST-NOT: .eabi_attribute 19
-; CORTEX-A9-SOFT-FAST-NOT: .eabi_attribute 19
-;; The A9 defaults to a VFPv3 FPU, so it flushes preserving the sign when
-;; -ffast-math is specified.
-; CORTEX-A8-SOFT-FAST: .eabi_attribute 20, 2
-; CORTEX-A9-SOFT-FAST: .eabi_attribute 20, 2
-; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 21
-; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 22
-; CORTEX-A5-SOFT-FAST: .eabi_attribute 23, 1
; CORTEX-A8-HARD: .cpu cortex-a8
; CORTEX-A8-HARD: .eabi_attribute 6, 10
@@ -766,21 +643,6 @@
; CORTEX-A9-HARD: .eabi_attribute 28, 1
; CORTEX-A9-HARD: .eabi_attribute 38, 1
-; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 19
-;; The A8 defaults to a VFPv3 FPU, so it flushes preserving the sign when
-;; -ffast-math is specified.
-; CORTEX-A8-HARD-FAST: .eabi_attribute 20, 2
-; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 21
-; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 22
-; CORTEX-A8-HARD-FAST: .eabi_attribute 23, 1
-
-; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 19
-;; The A9 defaults to a VFPv3 FPU, so it flushes preserving the sign when
-;; -ffast-math is specified.
-; CORTEX-A9-HARD-FAST: .eabi_attribute 20, 2
-; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 21
-; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 22
-; CORTEX-A9-HARD-FAST: .eabi_attribute 23, 1
; CORTEX-A12-DEFAULT: .cpu cortex-a12
; CORTEX-A12-DEFAULT: .eabi_attribute 6, 10
@@ -800,13 +662,6 @@
; CORTEX-A12-DEFAULT: .eabi_attribute 24, 1
; CORTEX-A12-DEFAULT: .eabi_attribute 25, 1
-; CORTEX-A12-DEFAULT-FAST-NOT: .eabi_attribute 19
-;; The A12 defaults to a VFPv3 FPU, so it flushes preserving the sign when
-;; -ffast-math is specified.
-; CORTEX-A12-DEFAULT-FAST: .eabi_attribute 20, 2
-; CORTEX-A12-HARD-FAST-NOT: .eabi_attribute 21
-; CORTEX-A12-HARD-FAST-NOT: .eabi_attribute 22
-; CORTEX-A12-HARD-FAST: .eabi_attribute 23, 1
; CORTEX-A12-NOFPU: .cpu cortex-a12
; CORTEX-A12-NOFPU: .eabi_attribute 6, 10
@@ -826,14 +681,6 @@
; CORTEX-A12-NOFPU: .eabi_attribute 24, 1
; CORTEX-A12-NOFPU: .eabi_attribute 25, 1
-; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 19
-;; Despite there being no FPU, we chose to flush to zero preserving
-;; sign. This matches what the hardware would do for this architecture
-;; revision.
-; CORTEX-A12-NOFPU-FAST: .eabi_attribute 20, 2
-; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 21
-; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 22
-; CORTEX-A12-NOFPU-FAST: .eabi_attribute 23, 1
; CORTEX-A15: .cpu cortex-a15
; CORTEX-A15: .eabi_attribute 6, 10
@@ -857,13 +704,6 @@
; CORTEX-A15-NOT: .eabi_attribute 28
; CORTEX-A15: .eabi_attribute 38, 1
-; CORTEX-A15-FAST-NOT: .eabi_attribute 19
-;; The A15 defaults to a VFPv3 FPU, so it flushes preserving the sign when
-;; -ffast-math is specified.
-; CORTEX-A15-FAST: .eabi_attribute 20, 2
-; CORTEX-A15-FAST-NOT: .eabi_attribute 21
-; CORTEX-A15-FAST-NOT: .eabi_attribute 22
-; CORTEX-A15-FAST: .eabi_attribute 23, 1
; CORTEX-A17-DEFAULT: .cpu cortex-a17
; CORTEX-A17-DEFAULT: .eabi_attribute 6, 10
@@ -883,13 +723,6 @@
; CORTEX-A17-DEFAULT: .eabi_attribute 24, 1
; CORTEX-A17-DEFAULT: .eabi_attribute 25, 1
-; CORTEX-A17-FAST-NOT: .eabi_attribute 19
-;; The A17 defaults to a VFPv3 FPU, so it flushes preserving the sign when
-;; -ffast-math is specified.
-; CORTEX-A17-FAST: .eabi_attribute 20, 2
-; CORTEX-A17-FAST-NOT: .eabi_attribute 21
-; CORTEX-A17-FAST-NOT: .eabi_attribute 22
-; CORTEX-A17-FAST: .eabi_attribute 23, 1
; CORTEX-A17-NOFPU: .cpu cortex-a17
; CORTEX-A17-NOFPU: .eabi_attribute 6, 10
@@ -910,13 +743,6 @@
; CORTEX-A17-NOFPU: .eabi_attribute 25, 1
; CORTEX-A17-NOFPU-NOT: .eabi_attribute 19
-;; Despite there being no FPU, we chose to flush to zero preserving
-;; sign. This matches what the hardware would do for this architecture
-;; revision.
-; CORTEX-A17-NOFPU-FAST: .eabi_attribute 20, 2
-; CORTEX-A17-NOFPU-FAST-NOT: .eabi_attribute 21
-; CORTEX-A17-NOFPU-FAST-NOT: .eabi_attribute 22
-; CORTEX-A17-NOFPU-FAST: .eabi_attribute 23, 1
; Test flags -enable-no-trapping-fp-math and -denormal-fp-math:
; NO-TRAPPING-MATH: .eabi_attribute 21, 0
@@ -946,16 +772,6 @@
; CORTEX-M0-NOT: .eabi_attribute 28
; CORTEX-M0: .eabi_attribute 38, 1
-; CORTEX-M0-FAST-NOT: .eabi_attribute 19
-;; Despite the M0 CPU having no FPU in this scenario, we chose to
-;; flush to positive zero here. There's no hardware support doing
-;; this, but the fast maths software library might and such behaviour
-;; would match hardware support on this architecture revision if it
-;; existed.
-; CORTEX-M0-FAST-NOT: .eabi_attribute 20
-; CORTEX-M0-FAST-NOT: .eabi_attribute 21
-; CORTEX-M0-FAST-NOT: .eabi_attribute 22
-; CORTEX-M0-FAST: .eabi_attribute 23, 1
; CORTEX-M0PLUS: .cpu cortex-m0plus
; CORTEX-M0PLUS: .eabi_attribute 6, 12
@@ -978,16 +794,6 @@
; CORTEX-M0PLUS-NOT: .eabi_attribute 28
; CORTEX-M0PLUS: .eabi_attribute 38, 1
-; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 19
-;; Despite the M0+ CPU having no FPU in this scenario, we chose to
-;; flush to positive zero here. There's no hardware support doing
-;; this, but the fast maths software library might and such behaviour
-;; would match hardware support on this architecture revision if it
-;; existed.
-; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 20
-; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 21
-; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 22
-; CORTEX-M0PLUS-FAST: .eabi_attribute 23, 1
; CORTEX-M1: .cpu cortex-m1
; CORTEX-M1: .eabi_attribute 6, 12
@@ -1010,16 +816,6 @@
; CORTEX-M1-NOT: .eabi_attribute 28
; CORTEX-M1: .eabi_attribute 38, 1
-; CORTEX-M1-FAST-NOT: .eabi_attribute 19
-;; Despite the M1 CPU having no FPU in this scenario, we chose to
-;; flush to positive zero here. There's no hardware support doing
-;; this, but the fast maths software library might and such behaviour
-;; would match hardware support on this architecture revision if it
-;; existed.
-; CORTEX-M1-FAST-NOT: .eabi_attribute 20
-; CORTEX-M1-FAST-NOT: .eabi_attribute 21
-; CORTEX-M1-FAST-NOT: .eabi_attribute 22
-; CORTEX-M1-FAST: .eabi_attribute 23, 1
; SC000: .cpu sc000
; SC000: .eabi_attribute 6, 12
@@ -1041,16 +837,6 @@
; SC000-NOT: .eabi_attribute 28
; SC000: .eabi_attribute 38, 1
-; SC000-FAST-NOT: .eabi_attribute 19
-;; Despite the SC000 CPU having no FPU in this scenario, we chose to
-;; flush to positive zero here. There's no hardware support doing
-;; this, but the fast maths software library might and such behaviour
-;; would match hardware support on this architecture revision if it
-;; existed.
-; SC000-FAST-NOT: .eabi_attribute 20
-; SC000-FAST-NOT: .eabi_attribute 21
-; SC000-FAST-NOT: .eabi_attribute 22
-; SC000-FAST: .eabi_attribute 23, 1
; CORTEX-M3: .cpu cortex-m3
; CORTEX-M3: .eabi_attribute 6, 10
@@ -1073,14 +859,6 @@
; CORTEX-M3-NOT: .eabi_attribute 28
; CORTEX-M3: .eabi_attribute 38, 1
-; CORTEX-M3-FAST-NOT: .eabi_attribute 19
-;; Despite there being no FPU, we chose to flush to zero preserving
-;; sign. This matches what the hardware would do for this architecture
-;; revision.
-; CORTEX-M3-FAST: .eabi_attribute 20, 2
-; CORTEX-M3-FAST-NOT: .eabi_attribute 21
-; CORTEX-M3-FAST-NOT: .eabi_attribute 22
-; CORTEX-M3-FAST: .eabi_attribute 23, 1
; SC300: .cpu sc300
; SC300: .eabi_attribute 6, 10
@@ -1103,14 +881,6 @@
; SC300-NOT: .eabi_attribute 28
; SC300: .eabi_attribute 38, 1
-; SC300-FAST-NOT: .eabi_attribute 19
-;; Despite there being no FPU, we chose to flush to zero preserving
-;; sign. This matches what the hardware would do for this architecture
-;; revision.
-; SC300-FAST: .eabi_attribute 20, 2
-; SC300-FAST-NOT: .eabi_attribute 21
-; SC300-FAST-NOT: .eabi_attribute 22
-; SC300-FAST: .eabi_attribute 23, 1
; CORTEX-M4-SOFT: .cpu cortex-m4
; CORTEX-M4-SOFT: .eabi_attribute 6, 13
@@ -1134,13 +904,6 @@
; CORTEX-M4-SOFT-NOT: .eabi_attribute 28
; CORTEX-M4-SOFT: .eabi_attribute 38, 1
-; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 19
-;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when
-;; -ffast-math is specified.
-; CORTEX-M4-SOFT-FAST: .eabi_attribute 20, 2
-; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 21
-; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 22
-; CORTEX-M4-SOFT-FAST: .eabi_attribute 23, 1
; CORTEX-M4-HARD: .cpu cortex-m4
; CORTEX-M4-HARD: .eabi_attribute 6, 13
@@ -1164,13 +927,6 @@
; CORTEX-M4-HARD: .eabi_attribute 28, 1
; CORTEX-M4-HARD: .eabi_attribute 38, 1
-; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 19
-;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when
-;; -ffast-math is specified.
-; CORTEX-M4-HARD-FAST: .eabi_attribute 20, 2
-; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 21
-; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 22
-; CORTEX-M4-HARD-FAST: .eabi_attribute 23, 1
; CORTEX-M7: .cpu cortex-m7
; CORTEX-M7: .eabi_attribute 6, 13
@@ -1197,16 +953,6 @@
; CORTEX-M7: .eabi_attribute 38, 1
; CORTEX-M7: .eabi_attribute 14, 0
-; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 19
-;; The M7 has the ARMv8 FP unit, which always flushes preserving sign.
-; CORTEX-M7-FAST: .eabi_attribute 20, 2
-;; Despite there being no FPU, we chose to flush to zero preserving
-;; sign. This matches what the hardware would do for this architecture
-;; revision.
-; CORTEX-M7-NOFPU-FAST: .eabi_attribute 20, 2
-; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 21
-; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 22
-; CORTEX-M7-NOFPU-FAST: .eabi_attribute 23, 1
; CORTEX-R4: .cpu cortex-r4
; CORTEX-R4: .eabi_attribute 6, 10
@@ -1273,12 +1019,6 @@
; CORTEX-R5-NOT: .eabi_attribute 28
; CORTEX-R5: .eabi_attribute 38, 1
-; CORTEX-R5-FAST-NOT: .eabi_attribute 19
-;; The R5 has the VFPv3 FP unit, which always flushes preserving sign.
-; CORTEX-R5-FAST: .eabi_attribute 20, 2
-; CORTEX-R5-FAST-NOT: .eabi_attribute 21
-; CORTEX-R5-FAST-NOT: .eabi_attribute 22
-; CORTEX-R5-FAST: .eabi_attribute 23, 1
; CORTEX-R7: .cpu cortex-r7
; CORTEX-R7: .eabi_attribute 6, 10
@@ -1301,12 +1041,6 @@
; CORTEX-R7-NOT: .eabi_attribute 28
; CORTEX-R7: .eabi_attribute 38, 1
-; CORTEX-R7-FAST-NOT: .eabi_attribute 19
-;; The R7 has the VFPv3 FP unit, which always flushes preserving sign.
-; CORTEX-R7-FAST: .eabi_attribute 20, 2
-; CORTEX-R7-FAST-NOT: .eabi_attribute 21
-; CORTEX-R7-FAST-NOT: .eabi_attribute 22
-; CORTEX-R7-FAST: .eabi_attribute 23, 1
; CORTEX-R8: .cpu cortex-r8
; CORTEX-R8: .eabi_attribute 6, 10
@@ -1329,12 +1063,6 @@
; CORTEX-R8-NOT: .eabi_attribute 28
; CORTEX-R8: .eabi_attribute 38, 1
-; CORTEX-R8-FAST-NOT: .eabi_attribute 19
-;; The R8 has the VFPv3 FP unit, which always flushes preserving sign.
-; CORTEX-R8-FAST: .eabi_attribute 20, 2
-; CORTEX-R8-FAST-NOT: .eabi_attribute 21
-; CORTEX-R8-FAST-NOT: .eabi_attribute 22
-; CORTEX-R8-FAST: .eabi_attribute 23, 1
; CORTEX-A32: .cpu cortex-a32
; CORTEX-A32: .eabi_attribute 6, 14
@@ -1359,12 +1087,6 @@
; CORTEX-A32-NOT: .eabi_attribute 28
; CORTEX-A32: .eabi_attribute 38, 1
-; CORTEX-A32-FAST-NOT: .eabi_attribute 19
-;; The A32 has the ARMv8 FP unit, which always flushes preserving sign.
-; CORTEX-A32-FAST: .eabi_attribute 20, 2
-; CORTEX-A32-FAST-NOT: .eabi_attribute 21
-; CORTEX-A32-FAST-NOT: .eabi_attribute 22
-; CORTEX-A32-FAST: .eabi_attribute 23, 1
; CORTEX-M23: .cpu cortex-m23
; CORTEX-M23: .eabi_attribute 6, 16
@@ -1430,11 +1152,6 @@
; CORTEX-M35P: .eabi_attribute 38, 1
; CORTEX-M35P: .eabi_attribute 14, 0
-; CORTEX-M33-FAST-NOT: .eabi_attribute 19
-; CORTEX-M33-FAST: .eabi_attribute 20, 2
-; CORTEX-M33-FAST-NOT: .eabi_attribute 21
-; CORTEX-M33-FAST-NOT: .eabi_attribute 22
-; CORTEX-M33-FAST: .eabi_attribute 23, 1
; CORTEX-A35: .cpu cortex-a35
; CORTEX-A35: .eabi_attribute 6, 14
@@ -1459,12 +1176,6 @@
; CORTEX-A35-NOT: .eabi_attribute 28
; CORTEX-A35: .eabi_attribute 38, 1
-; CORTEX-A35-FAST-NOT: .eabi_attribute 19
-;; The A35 has the ARMv8 FP unit, which always flushes preserving sign.
-; CORTEX-A35-FAST: .eabi_attribute 20, 2
-; CORTEX-A35-FAST-NOT: .eabi_attribute 21
-; CORTEX-A35-FAST-NOT: .eabi_attribute 22
-; CORTEX-A35-FAST: .eabi_attribute 23, 1
; CORTEX-A53: .cpu cortex-a53
; CORTEX-A53: .eabi_attribute 6, 14
@@ -1489,12 +1200,6 @@
; CORTEX-A53-NOT: .eabi_attribute 28
; CORTEX-A53: .eabi_attribute 38, 1
-; CORTEX-A53-FAST-NOT: .eabi_attribute 19
-;; The A53 has the ARMv8 FP unit, which always flushes preserving sign.
-; CORTEX-A53-FAST: .eabi_attribute 20, 2
-; CORTEX-A53-FAST-NOT: .eabi_attribute 21
-; CORTEX-A53-FAST-NOT: .eabi_attribute 22
-; CORTEX-A53-FAST: .eabi_attribute 23, 1
; CORTEX-A57: .cpu cortex-a57
; CORTEX-A57: .eabi_attribute 6, 14
@@ -1519,12 +1224,6 @@
; CORTEX-A57-NOT: .eabi_attribute 28
; CORTEX-A57: .eabi_attribute 38, 1
-; CORTEX-A57-FAST-NOT: .eabi_attribute 19
-;; The A57 has the ARMv8 FP unit, which always flushes preserving sign.
-; CORTEX-A57-FAST: .eabi_attribute 20, 2
-; CORTEX-A57-FAST-NOT: .eabi_attribute 21
-; CORTEX-A57-FAST-NOT: .eabi_attribute 22
-; CORTEX-A57-FAST: .eabi_attribute 23, 1
; CORTEX-A72: .cpu cortex-a72
; CORTEX-A72: .eabi_attribute 6, 14
@@ -1549,12 +1248,6 @@
; CORTEX-A72-NOT: .eabi_attribute 28
; CORTEX-A72: .eabi_attribute 38, 1
-; CORTEX-A72-FAST-NOT: .eabi_attribute 19
-;; The A72 has the ARMv8 FP unit, which always flushes preserving sign.
-; CORTEX-A72-FAST: .eabi_attribute 20, 2
-; CORTEX-A72-FAST-NOT: .eabi_attribute 21
-; CORTEX-A72-FAST-NOT: .eabi_attribute 22
-; CORTEX-A72-FAST: .eabi_attribute 23, 1
; CORTEX-A73: .cpu cortex-a73
; CORTEX-A73: .eabi_attribute 6, 14
@@ -1580,12 +1273,6 @@
; CORTEX-A73: .eabi_attribute 38, 1
; CORTEX-A73: .eabi_attribute 14, 0
-; EXYNOS-FAST-NOT: .eabi_attribute 19
-;; The Exynos processors have the ARMv8 FP unit, which always flushes preserving sign.
-; EXYNOS-FAST: .eabi_attribute 20, 2
-; EXYNOS-FAST-NOT: .eabi_attribute 21
-; EXYNOS-FAST-NOT: .eabi_attribute 22
-; EXYNOS-FAST: .eabi_attribute 23, 1
; EXYNOS-M3: .cpu exynos-m3
; EXYNOS-M3: .eabi_attribute 6, 14
@@ -1684,12 +1371,6 @@
; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 28
; GENERIC-ARMV8_1-A: .eabi_attribute 38, 1
-; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 19
-;; GENERIC-ARMV8_1-A has the ARMv8 FP unit, which always flushes preserving sign.
-; GENERIC-ARMV8_1-A-FAST: .eabi_attribute 20, 2
-; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 21
-; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 22
-; GENERIC-ARMV8_1-A-FAST: .eabi_attribute 23, 1
; RELOC-PIC: .eabi_attribute 15, 1
; RELOC-PIC: .eabi_attribute 16, 1
diff --git a/llvm/test/CodeGen/LoongArch/lasx/abs.ll b/llvm/test/CodeGen/LoongArch/lasx/abs.ll
new file mode 100644
index 0000000..e3b0d04d
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/abs.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+define void @vabs_b(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_b:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvneg.b $xr1, $xr0
+; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <32 x i8>, ptr %src
+ %b = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a, i1 true)
+ store <32 x i8> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_b_1(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_b_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvneg.b $xr1, $xr0
+; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <32 x i8>, ptr %src
+ %b = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a, i1 false)
+ store <32 x i8> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_h(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_h:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvneg.h $xr1, $xr0
+; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <16 x i16>, ptr %src
+ %b = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a, i1 true)
+ store <16 x i16> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_h_1(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_h_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvneg.h $xr1, $xr0
+; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <16 x i16>, ptr %src
+ %b = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a, i1 false)
+ store <16 x i16> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_w(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_w:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvneg.w $xr1, $xr0
+; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <8 x i32>, ptr %src
+ %b = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 true)
+ store <8 x i32> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_w_1(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_w_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvneg.w $xr1, $xr0
+; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <8 x i32>, ptr %src
+ %b = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 false)
+ store <8 x i32> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_d(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvneg.d $xr1, $xr0
+; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x i64>, ptr %src
+ %b = tail call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a, i1 true)
+ store <4 x i64> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_d_1(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_d_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvneg.d $xr1, $xr0
+; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x i64>, ptr %src
+ %b = tail call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a, i1 false)
+ store <4 x i64> %b, ptr %dst
+ ret void
+}
+
+declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1)
+declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
+declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
+declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll
index e66a152..9868775 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll
@@ -7,11 +7,7 @@ define void @vadda_b(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
-; CHECK-NEXT: xvneg.b $xr2, $xr0
-; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr2
-; CHECK-NEXT: xvneg.b $xr2, $xr1
-; CHECK-NEXT: xvmax.b $xr1, $xr1, $xr2
-; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1
+; CHECK-NEXT: xvadda.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -33,11 +29,7 @@ define void @vadda_h(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
-; CHECK-NEXT: xvneg.h $xr2, $xr0
-; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr2
-; CHECK-NEXT: xvneg.h $xr2, $xr1
-; CHECK-NEXT: xvmax.h $xr1, $xr1, $xr2
-; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1
+; CHECK-NEXT: xvadda.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -59,11 +51,7 @@ define void @vadda_w(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
-; CHECK-NEXT: xvneg.w $xr2, $xr0
-; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr2
-; CHECK-NEXT: xvneg.w $xr2, $xr1
-; CHECK-NEXT: xvmax.w $xr1, $xr1, $xr2
-; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1
+; CHECK-NEXT: xvadda.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -85,11 +73,7 @@ define void @vadda_d(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
-; CHECK-NEXT: xvneg.d $xr2, $xr0
-; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr2
-; CHECK-NEXT: xvneg.d $xr2, $xr1
-; CHECK-NEXT: xvmax.d $xr1, $xr1, $xr2
-; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1
+; CHECK-NEXT: xvadda.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll
new file mode 100644
index 0000000..39ac647
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s
+
+define <8 x float> @fadd_elt0_v8f32(float %a) nounwind {
+; CHECK-LABEL: fadd_elt0_v8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vldi $vr1, -1168
+; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1
+; CHECK-NEXT: ret
+entry:
+ %b = insertelement <8 x float> poison, float %a, i32 0
+ %c = fadd <8 x float> %b, <float 1.0, float poison, float poison, float poison, float poison, float poison, float poison, float poison>
+ ret <8 x float> %c
+}
+
+define <4 x double> @fadd_elt0_v4f64(double %a) nounwind {
+; CHECK-LABEL: fadd_elt0_v4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vldi $vr1, -912
+; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1
+; CHECK-NEXT: ret
+entry:
+ %b = insertelement <4 x double> poison, double %a, i32 0
+ %c = fadd <4 x double> %b, <double 1.0, double poison, double poison, double poison>
+ ret <4 x double> %c
+}
+
+define <8 x float> @fsub_splat_v8f32(float %a, float %b) nounwind {
+; CHECK-LABEL: fsub_splat_v8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsub.s $fa0, $fa0, $fa1
+; CHECK-NEXT: xvreplve0.w $xr0, $xr0
+; CHECK-NEXT: ret
+entry:
+ %insa = insertelement <8 x float> poison, float %a, i32 0
+ %insb = insertelement <8 x float> poison, float %b, i32 0
+ %va = shufflevector <8 x float> %insa, <8 x float> poison, <8 x i32> zeroinitializer
+ %vb = shufflevector <8 x float> %insb, <8 x float> poison, <8 x i32> zeroinitializer
+ %c = fsub <8 x float> %va, %vb
+ ret <8 x float> %c
+}
+
+define <4 x double> @fsub_splat_v4f64(double %a) nounwind {
+; CHECK-LABEL: fsub_splat_v4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vldi $vr1, -784
+; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1
+; CHECK-NEXT: xvreplve0.d $xr0, $xr0
+; CHECK-NEXT: ret
+entry:
+ %insa = insertelement <4 x double> poison, double %a, i32 0
+ %insb = insertelement <4 x double> poison, double 1.0, i32 0
+ %va = shufflevector <4 x double> %insa, <4 x double> poison, <4 x i32> zeroinitializer
+ %vb = shufflevector <4 x double> %insb, <4 x double> poison, <4 x i32> zeroinitializer
+ %c = fsub <4 x double> %va, %vb
+ ret <4 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/abs.ll b/llvm/test/CodeGen/LoongArch/lsx/abs.ll
new file mode 100644
index 0000000..85fe1fe
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/abs.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+
+define void @vabs_b(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_b:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vneg.b $vr1, $vr0
+; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <16 x i8>, ptr %src
+ %b = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 true)
+ store <16 x i8> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_b_1(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_b_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vneg.b $vr1, $vr0
+; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <16 x i8>, ptr %src
+ %b = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false)
+ store <16 x i8> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_h(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_h:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vneg.h $vr1, $vr0
+; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <8 x i16>, ptr %src
+ %b = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 true)
+ store <8 x i16> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_h_1(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_h_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vneg.h $vr1, $vr0
+; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <8 x i16>, ptr %src
+ %b = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 false)
+ store <8 x i16> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_w(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_w:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vneg.w $vr1, $vr0
+; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x i32>, ptr %src
+ %b = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 true)
+ store <4 x i32> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_w_1(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_w_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vneg.w $vr1, $vr0
+; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <4 x i32>, ptr %src
+ %b = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false)
+ store <4 x i32> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_d(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vneg.d $vr1, $vr0
+; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <2 x i64>, ptr %src
+ %b = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a, i1 true)
+ store <2 x i64> %b, ptr %dst
+ ret void
+}
+
+define void @vabs_d_1(ptr %dst, ptr %src) {
+; CHECK-LABEL: vabs_d_1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vneg.d $vr1, $vr0
+; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %a = load <2 x i64>, ptr %src
+ %b = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a, i1 false)
+ store <2 x i64> %b, ptr %dst
+ ret void
+}
+
+declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
+declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
+declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
+declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll
index 2bd0b59..34f22e1f 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll
@@ -7,11 +7,7 @@ define void @vadda_b(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
; CHECK-NEXT: vld $vr1, $a2, 0
-; CHECK-NEXT: vneg.b $vr2, $vr0
-; CHECK-NEXT: vmax.b $vr0, $vr0, $vr2
-; CHECK-NEXT: vneg.b $vr2, $vr1
-; CHECK-NEXT: vmax.b $vr1, $vr1, $vr2
-; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1
+; CHECK-NEXT: vadda.b $vr0, $vr0, $vr1
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -33,11 +29,7 @@ define void @vadda_h(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
; CHECK-NEXT: vld $vr1, $a2, 0
-; CHECK-NEXT: vneg.h $vr2, $vr0
-; CHECK-NEXT: vmax.h $vr0, $vr0, $vr2
-; CHECK-NEXT: vneg.h $vr2, $vr1
-; CHECK-NEXT: vmax.h $vr1, $vr1, $vr2
-; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1
+; CHECK-NEXT: vadda.h $vr0, $vr0, $vr1
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -59,11 +51,7 @@ define void @vadda_w(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
; CHECK-NEXT: vld $vr1, $a2, 0
-; CHECK-NEXT: vneg.w $vr2, $vr0
-; CHECK-NEXT: vmax.w $vr0, $vr0, $vr2
-; CHECK-NEXT: vneg.w $vr2, $vr1
-; CHECK-NEXT: vmax.w $vr1, $vr1, $vr2
-; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1
+; CHECK-NEXT: vadda.w $vr0, $vr0, $vr1
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -85,11 +73,7 @@ define void @vadda_d(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vld $vr0, $a1, 0
; CHECK-NEXT: vld $vr1, $a2, 0
-; CHECK-NEXT: vneg.d $vr2, $vr0
-; CHECK-NEXT: vmax.d $vr0, $vr0, $vr2
-; CHECK-NEXT: vneg.d $vr2, $vr1
-; CHECK-NEXT: vmax.d $vr1, $vr1, $vr2
-; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1
+; CHECK-NEXT: vadda.d $vr0, $vr0, $vr1
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll
new file mode 100644
index 0000000..b651f11
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lsx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s
+
+define <4 x float> @fadd_elt0_v4f32(float %a) nounwind {
+; CHECK-LABEL: fadd_elt0_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vldi $vr1, -1168
+; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1
+; CHECK-NEXT: ret
+entry:
+ %b = insertelement <4 x float> poison, float %a, i32 0
+ %c = fadd <4 x float> %b, <float 1.0, float poison, float poison, float poison>
+ ret <4 x float> %c
+}
+
+define <2 x double> @fadd_elt0_v2f64(double %a) nounwind {
+; CHECK-LABEL: fadd_elt0_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vldi $vr1, -912
+; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1
+; CHECK-NEXT: ret
+entry:
+ %b = insertelement <2 x double> poison, double %a, i32 0
+ %c = fadd <2 x double> %b, <double 1.0, double poison>
+ ret <2 x double> %c
+}
+
+define <4 x float> @fsub_splat_v4f32(float %b) nounwind {
+; CHECK-LABEL: fsub_splat_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vldi $vr1, -1168
+; CHECK-NEXT: fsub.s $fa0, $fa1, $fa0
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: ret
+entry:
+ %insa = insertelement <4 x float> poison, float 1.0, i32 0
+ %insb = insertelement <4 x float> poison, float %b, i32 0
+ %va = shufflevector <4 x float> %insa, <4 x float> poison, <4 x i32> zeroinitializer
+ %vb = shufflevector <4 x float> %insb, <4 x float> poison, <4 x i32> zeroinitializer
+ %c = fsub <4 x float> %va, %vb
+ ret <4 x float> %c
+}
+
+define <2 x double> @fsub_splat_v2f64(double %a, double %b) nounwind {
+; CHECK-LABEL: fsub_splat_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: fsub.d $fa0, $fa0, $fa1
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: ret
+entry:
+ %insa = insertelement <2 x double> poison, double %a, i32 0
+ %insb = insertelement <2 x double> poison, double %b, i32 0
+ %va = shufflevector <2 x double> %insa, <2 x double> poison, <2 x i32> zeroinitializer
+ %vb = shufflevector <2 x double> %insb, <2 x double> poison, <2 x i32> zeroinitializer
+ %c = fsub <2 x double> %va, %vb
+ ret <2 x double> %c
+}
diff --git a/llvm/test/CodeGen/LoongArch/merge-offset-option.ll b/llvm/test/CodeGen/LoongArch/merge-offset-option.ll
new file mode 100644
index 0000000..e5351a6
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/merge-offset-option.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 -mattr=+d --relocation-model=static -O1 \
+; RUN: < %s | FileCheck %s --check-prefix=MERGE
+; RUN: llc --mtriple=loongarch64 -mattr=+d --relocation-model=static -O1 \
+; RUN: --loongarch-enable-merge-offset=false < %s | FileCheck %s --check-prefix=NO_MERGE
+
+@g = dso_local global i32 zeroinitializer, align 4
+
+define void @foo() nounwind {
+; MERGE-LABEL: foo:
+; MERGE: # %bb.0:
+; MERGE-NEXT: pcalau12i $a0, %pc_hi20(g)
+; MERGE-NEXT: ld.w $zero, $a0, %pc_lo12(g)
+; MERGE-NEXT: ret
+;
+; NO_MERGE-LABEL: foo:
+; NO_MERGE: # %bb.0:
+; NO_MERGE-NEXT: pcalau12i $a0, %pc_hi20(g)
+; NO_MERGE-NEXT: addi.d $a0, $a0, %pc_lo12(g)
+; NO_MERGE-NEXT: ld.w $zero, $a0, 0
+; NO_MERGE-NEXT: ret
+ %v = load volatile i32, ptr @g
+ ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
index aaabd76e..fd0b494 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
@@ -20,18 +20,18 @@
define float @select_oeq_float(float %a, float %b, float %c, float %d) {
; FAST-P8-LABEL: select_oeq_float:
; FAST-P8: # %bb.0: # %entry
-; FAST-P8-NEXT: xssubsp f0, f2, f1
-; FAST-P8-NEXT: xssubsp f1, f1, f2
-; FAST-P8-NEXT: fsel f1, f1, f3, f4
-; FAST-P8-NEXT: fsel f1, f0, f1, f4
+; FAST-P8-NEXT: xssubsp f0, f1, f2
+; FAST-P8-NEXT: xsnegdp f1, f0
+; FAST-P8-NEXT: fsel f0, f0, f3, f4
+; FAST-P8-NEXT: fsel f1, f1, f0, f4
; FAST-P8-NEXT: blr
;
; FAST-P9-LABEL: select_oeq_float:
; FAST-P9: # %bb.0: # %entry
-; FAST-P9-NEXT: xssubsp f0, f2, f1
-; FAST-P9-NEXT: xssubsp f1, f1, f2
-; FAST-P9-NEXT: fsel f1, f1, f3, f4
-; FAST-P9-NEXT: fsel f1, f0, f1, f4
+; FAST-P9-NEXT: xssubsp f0, f1, f2
+; FAST-P9-NEXT: xsnegdp f1, f0
+; FAST-P9-NEXT: fsel f0, f0, f3, f4
+; FAST-P9-NEXT: fsel f1, f1, f0, f4
; FAST-P9-NEXT: blr
;
; NO-FAST-P8-LABEL: select_oeq_float:
@@ -59,6 +59,48 @@ entry:
ret float %cond
}
+define float @select_oeq_float_nsz(float %a, float %b, float %c, float %d) {
+; FAST-P8-LABEL: select_oeq_float_nsz:
+; FAST-P8: # %bb.0: # %entry
+; FAST-P8-NEXT: xssubsp f0, f2, f1
+; FAST-P8-NEXT: xssubsp f1, f1, f2
+; FAST-P8-NEXT: fsel f1, f1, f3, f4
+; FAST-P8-NEXT: fsel f1, f0, f1, f4
+; FAST-P8-NEXT: blr
+;
+; FAST-P9-LABEL: select_oeq_float_nsz:
+; FAST-P9: # %bb.0: # %entry
+; FAST-P9-NEXT: xssubsp f0, f2, f1
+; FAST-P9-NEXT: xssubsp f1, f1, f2
+; FAST-P9-NEXT: fsel f1, f1, f3, f4
+; FAST-P9-NEXT: fsel f1, f0, f1, f4
+; FAST-P9-NEXT: blr
+;
+; NO-FAST-P8-LABEL: select_oeq_float_nsz:
+; NO-FAST-P8: # %bb.0: # %entry
+; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
+; NO-FAST-P8-NEXT: beq cr0, .LBB1_2
+; NO-FAST-P8-NEXT: # %bb.1: # %entry
+; NO-FAST-P8-NEXT: fmr f3, f4
+; NO-FAST-P8-NEXT: .LBB1_2: # %entry
+; NO-FAST-P8-NEXT: fmr f1, f3
+; NO-FAST-P8-NEXT: blr
+;
+; NO-FAST-P9-LABEL: select_oeq_float_nsz:
+; NO-FAST-P9: # %bb.0: # %entry
+; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
+; NO-FAST-P9-NEXT: beq cr0, .LBB1_2
+; NO-FAST-P9-NEXT: # %bb.1: # %entry
+; NO-FAST-P9-NEXT: fmr f3, f4
+; NO-FAST-P9-NEXT: .LBB1_2: # %entry
+; NO-FAST-P9-NEXT: fmr f1, f3
+; NO-FAST-P9-NEXT: blr
+entry:
+ %cmp = fcmp nsz oeq float %a, %b
+ %cond = select i1 %cmp, float %c, float %d
+ ret float %cond
+}
+
define double @select_oeq_double(double %a, double %b, double %c, double %d) {
; FAST-P8-LABEL: select_oeq_double:
; FAST-P8: # %bb.0: # %entry
@@ -79,20 +121,20 @@ define double @select_oeq_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P8-LABEL: select_oeq_double:
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2
-; NO-FAST-P8-NEXT: beq cr0, .LBB1_2
+; NO-FAST-P8-NEXT: beq cr0, .LBB2_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB1_2: # %entry
+; NO-FAST-P8-NEXT: .LBB2_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
; NO-FAST-P9-LABEL: select_oeq_double:
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2
-; NO-FAST-P9-NEXT: beq cr0, .LBB1_2
+; NO-FAST-P9-NEXT: beq cr0, .LBB2_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB1_2: # %entry
+; NO-FAST-P9-NEXT: .LBB2_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -182,13 +224,57 @@ entry:
define float @select_one_float(float %a, float %b, float %c, float %d) {
; FAST-P8-LABEL: select_one_float:
; FAST-P8: # %bb.0: # %entry
+; FAST-P8-NEXT: xssubsp f0, f1, f2
+; FAST-P8-NEXT: xsnegdp f1, f0
+; FAST-P8-NEXT: fsel f0, f0, f4, f3
+; FAST-P8-NEXT: fsel f1, f1, f0, f3
+; FAST-P8-NEXT: blr
+;
+; FAST-P9-LABEL: select_one_float:
+; FAST-P9: # %bb.0: # %entry
+; FAST-P9-NEXT: xssubsp f0, f1, f2
+; FAST-P9-NEXT: xsnegdp f1, f0
+; FAST-P9-NEXT: fsel f0, f0, f4, f3
+; FAST-P9-NEXT: fsel f1, f1, f0, f3
+; FAST-P9-NEXT: blr
+;
+; NO-FAST-P8-LABEL: select_one_float:
+; NO-FAST-P8: # %bb.0: # %entry
+; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
+; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq
+; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB5_2
+; NO-FAST-P8-NEXT: # %bb.1: # %entry
+; NO-FAST-P8-NEXT: fmr f3, f4
+; NO-FAST-P8-NEXT: .LBB5_2: # %entry
+; NO-FAST-P8-NEXT: fmr f1, f3
+; NO-FAST-P8-NEXT: blr
+;
+; NO-FAST-P9-LABEL: select_one_float:
+; NO-FAST-P9: # %bb.0: # %entry
+; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
+; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq
+; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB5_2
+; NO-FAST-P9-NEXT: # %bb.1: # %entry
+; NO-FAST-P9-NEXT: fmr f3, f4
+; NO-FAST-P9-NEXT: .LBB5_2: # %entry
+; NO-FAST-P9-NEXT: fmr f1, f3
+; NO-FAST-P9-NEXT: blr
+entry:
+ %cmp = fcmp one float %a, %b
+ %cond = select i1 %cmp, float %c, float %d
+ ret float %cond
+}
+
+define float @select_one_float_nsz(float %a, float %b, float %c, float %d) {
+; FAST-P8-LABEL: select_one_float_nsz:
+; FAST-P8: # %bb.0: # %entry
; FAST-P8-NEXT: xssubsp f0, f2, f1
; FAST-P8-NEXT: xssubsp f1, f1, f2
; FAST-P8-NEXT: fsel f1, f1, f4, f3
; FAST-P8-NEXT: fsel f1, f0, f1, f3
; FAST-P8-NEXT: blr
;
-; FAST-P9-LABEL: select_one_float:
+; FAST-P9-LABEL: select_one_float_nsz:
; FAST-P9: # %bb.0: # %entry
; FAST-P9-NEXT: xssubsp f0, f2, f1
; FAST-P9-NEXT: xssubsp f1, f1, f2
@@ -196,29 +282,29 @@ define float @select_one_float(float %a, float %b, float %c, float %d) {
; FAST-P9-NEXT: fsel f1, f0, f1, f3
; FAST-P9-NEXT: blr
;
-; NO-FAST-P8-LABEL: select_one_float:
+; NO-FAST-P8-LABEL: select_one_float_nsz:
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq
-; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB4_2
+; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB6_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB4_2: # %entry
+; NO-FAST-P8-NEXT: .LBB6_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
-; NO-FAST-P9-LABEL: select_one_float:
+; NO-FAST-P9-LABEL: select_one_float_nsz:
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq
-; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB4_2
+; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB6_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB4_2: # %entry
+; NO-FAST-P9-NEXT: .LBB6_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
- %cmp = fcmp one float %a, %b
+ %cmp = fcmp nsz one float %a, %b
%cond = select i1 %cmp, float %c, float %d
ret float %cond
}
@@ -244,10 +330,10 @@ define double @select_one_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq
-; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB5_2
+; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB7_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB5_2: # %entry
+; NO-FAST-P8-NEXT: .LBB7_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
@@ -255,10 +341,10 @@ define double @select_one_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq
-; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB5_2
+; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB7_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB5_2: # %entry
+; NO-FAST-P9-NEXT: .LBB7_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -362,10 +448,10 @@ define float @select_oge_float(float %a, float %b, float %c, float %d) {
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, lt
-; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB8_2
+; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB10_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB8_2: # %entry
+; NO-FAST-P8-NEXT: .LBB10_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
@@ -373,10 +459,10 @@ define float @select_oge_float(float %a, float %b, float %c, float %d) {
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, lt
-; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB8_2
+; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB10_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB8_2: # %entry
+; NO-FAST-P9-NEXT: .LBB10_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -402,10 +488,10 @@ define double @select_oge_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, lt
-; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB9_2
+; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB11_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB9_2: # %entry
+; NO-FAST-P8-NEXT: .LBB11_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
@@ -413,10 +499,10 @@ define double @select_oge_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, lt
-; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB9_2
+; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB11_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB9_2: # %entry
+; NO-FAST-P9-NEXT: .LBB11_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -503,20 +589,20 @@ define float @select_olt_float(float %a, float %b, float %c, float %d) {
; NO-FAST-P8-LABEL: select_olt_float:
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
-; NO-FAST-P8-NEXT: blt cr0, .LBB12_2
+; NO-FAST-P8-NEXT: blt cr0, .LBB14_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB12_2: # %entry
+; NO-FAST-P8-NEXT: .LBB14_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
; NO-FAST-P9-LABEL: select_olt_float:
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
-; NO-FAST-P9-NEXT: blt cr0, .LBB12_2
+; NO-FAST-P9-NEXT: blt cr0, .LBB14_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB12_2: # %entry
+; NO-FAST-P9-NEXT: .LBB14_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -541,20 +627,20 @@ define double @select_olt_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P8-LABEL: select_olt_double:
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2
-; NO-FAST-P8-NEXT: blt cr0, .LBB13_2
+; NO-FAST-P8-NEXT: blt cr0, .LBB15_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB13_2: # %entry
+; NO-FAST-P8-NEXT: .LBB15_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
; NO-FAST-P9-LABEL: select_olt_double:
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2
-; NO-FAST-P9-NEXT: blt cr0, .LBB13_2
+; NO-FAST-P9-NEXT: blt cr0, .LBB15_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB13_2: # %entry
+; NO-FAST-P9-NEXT: .LBB15_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -641,20 +727,20 @@ define float @select_ogt_float(float %a, float %b, float %c, float %d) {
; NO-FAST-P8-LABEL: select_ogt_float:
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
-; NO-FAST-P8-NEXT: bgt cr0, .LBB16_2
+; NO-FAST-P8-NEXT: bgt cr0, .LBB18_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB16_2: # %entry
+; NO-FAST-P8-NEXT: .LBB18_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
; NO-FAST-P9-LABEL: select_ogt_float:
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
-; NO-FAST-P9-NEXT: bgt cr0, .LBB16_2
+; NO-FAST-P9-NEXT: bgt cr0, .LBB18_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB16_2: # %entry
+; NO-FAST-P9-NEXT: .LBB18_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -679,20 +765,20 @@ define double @select_ogt_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P8-LABEL: select_ogt_double:
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2
-; NO-FAST-P8-NEXT: bgt cr0, .LBB17_2
+; NO-FAST-P8-NEXT: bgt cr0, .LBB19_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB17_2: # %entry
+; NO-FAST-P8-NEXT: .LBB19_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
; NO-FAST-P9-LABEL: select_ogt_double:
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2
-; NO-FAST-P9-NEXT: bgt cr0, .LBB17_2
+; NO-FAST-P9-NEXT: bgt cr0, .LBB19_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB17_2: # %entry
+; NO-FAST-P9-NEXT: .LBB19_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -780,10 +866,10 @@ define float @select_ole_float(float %a, float %b, float %c, float %d) {
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, gt
-; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB20_2
+; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB22_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB20_2: # %entry
+; NO-FAST-P8-NEXT: .LBB22_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
@@ -791,10 +877,10 @@ define float @select_ole_float(float %a, float %b, float %c, float %d) {
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, gt
-; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB20_2
+; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB22_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB20_2: # %entry
+; NO-FAST-P9-NEXT: .LBB22_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -820,10 +906,10 @@ define double @select_ole_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P8: # %bb.0: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, gt
-; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB21_2
+; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB23_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f3, f4
-; NO-FAST-P8-NEXT: .LBB21_2: # %entry
+; NO-FAST-P8-NEXT: .LBB23_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
@@ -831,10 +917,10 @@ define double @select_ole_double(double %a, double %b, double %c, double %d) {
; NO-FAST-P9: # %bb.0: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2
; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, gt
-; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB21_2
+; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB23_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f3, f4
-; NO-FAST-P9-NEXT: .LBB21_2: # %entry
+; NO-FAST-P9-NEXT: .LBB23_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -926,13 +1012,13 @@ define double @onecmp1(double %a, double %y, double %z) {
; NO-FAST-P8-NEXT: vspltisw v2, 1
; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f0
-; NO-FAST-P8-NEXT: bc 12, lt, .LBB24_3
+; NO-FAST-P8-NEXT: bc 12, lt, .LBB26_3
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fcmpu cr0, f1, f1
-; NO-FAST-P8-NEXT: bc 12, un, .LBB24_3
+; NO-FAST-P8-NEXT: bc 12, un, .LBB26_3
; NO-FAST-P8-NEXT: # %bb.2: # %entry
; NO-FAST-P8-NEXT: fmr f3, f2
-; NO-FAST-P8-NEXT: .LBB24_3: # %entry
+; NO-FAST-P8-NEXT: .LBB26_3: # %entry
; NO-FAST-P8-NEXT: fmr f1, f3
; NO-FAST-P8-NEXT: blr
;
@@ -941,13 +1027,13 @@ define double @onecmp1(double %a, double %y, double %z) {
; NO-FAST-P9-NEXT: vspltisw v2, 1
; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f0
-; NO-FAST-P9-NEXT: bc 12, lt, .LBB24_3
+; NO-FAST-P9-NEXT: bc 12, lt, .LBB26_3
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fcmpu cr0, f1, f1
-; NO-FAST-P9-NEXT: bc 12, un, .LBB24_3
+; NO-FAST-P9-NEXT: bc 12, un, .LBB26_3
; NO-FAST-P9-NEXT: # %bb.2: # %entry
; NO-FAST-P9-NEXT: fmr f3, f2
-; NO-FAST-P9-NEXT: .LBB24_3: # %entry
+; NO-FAST-P9-NEXT: .LBB26_3: # %entry
; NO-FAST-P9-NEXT: fmr f1, f3
; NO-FAST-P9-NEXT: blr
entry:
@@ -978,10 +1064,10 @@ define double @onecmp2(double %a, double %y, double %z) {
; NO-FAST-P8-NEXT: vspltisw v2, 1
; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34
; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f0
-; NO-FAST-P8-NEXT: bgt cr0, .LBB25_2
+; NO-FAST-P8-NEXT: bgt cr0, .LBB27_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f2, f3
-; NO-FAST-P8-NEXT: .LBB25_2: # %entry
+; NO-FAST-P8-NEXT: .LBB27_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f2
; NO-FAST-P8-NEXT: blr
;
@@ -990,10 +1076,10 @@ define double @onecmp2(double %a, double %y, double %z) {
; NO-FAST-P9-NEXT: vspltisw v2, 1
; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34
; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f0
-; NO-FAST-P9-NEXT: bgt cr0, .LBB25_2
+; NO-FAST-P9-NEXT: bgt cr0, .LBB27_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f2, f3
-; NO-FAST-P9-NEXT: .LBB25_2: # %entry
+; NO-FAST-P9-NEXT: .LBB27_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f2
; NO-FAST-P9-NEXT: blr
entry:
@@ -1028,10 +1114,10 @@ define double @onecmp3(double %a, double %y, double %z) {
; NO-FAST-P8-NEXT: vspltisw v2, 1
; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34
; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f0
-; NO-FAST-P8-NEXT: beq cr0, .LBB26_2
+; NO-FAST-P8-NEXT: beq cr0, .LBB28_2
; NO-FAST-P8-NEXT: # %bb.1: # %entry
; NO-FAST-P8-NEXT: fmr f2, f3
-; NO-FAST-P8-NEXT: .LBB26_2: # %entry
+; NO-FAST-P8-NEXT: .LBB28_2: # %entry
; NO-FAST-P8-NEXT: fmr f1, f2
; NO-FAST-P8-NEXT: blr
;
@@ -1040,10 +1126,10 @@ define double @onecmp3(double %a, double %y, double %z) {
; NO-FAST-P9-NEXT: vspltisw v2, 1
; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34
; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f0
-; NO-FAST-P9-NEXT: beq cr0, .LBB26_2
+; NO-FAST-P9-NEXT: beq cr0, .LBB28_2
; NO-FAST-P9-NEXT: # %bb.1: # %entry
; NO-FAST-P9-NEXT: fmr f2, f3
-; NO-FAST-P9-NEXT: .LBB26_2: # %entry
+; NO-FAST-P9-NEXT: .LBB28_2: # %entry
; NO-FAST-P9-NEXT: fmr f1, f2
; NO-FAST-P9-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
index 55b0d1f..2a46a59 100644
--- a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
+++ b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
@@ -155,3 +155,109 @@ define i1 @test9(i64 %x) {
%b = icmp eq i64 %a, u0x08000000
ret i1 %b
}
+
+; Make sure the and constant doesn't get converted to an opaque constant by
+; ConstantHoisting. If it's an opaque constant, we'll have addi -16 and addi 15.
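+; In the low 32 bits the add of u0xffffffff is a subtract of 1, so the fold
+; being checked is ((x - 1) & -16) != 0, i.e. the addi/andi/snez sequence on
+; RV32 and addi/sraiw/snez on RV64 below.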
+define i64 @test10(i64 %0) #0 {
+; RV32-LABEL: test10:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: andi a0, a0, -16
+; RV32-NEXT: snez a0, a0
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test10:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sraiw a0, a0, 4
+; RV64-NEXT: snez a0, a0
+; RV64-NEXT: ret
+entry:
+ %1 = add nuw nsw i64 %0, u0xffffffff
+ %2 = and i64 %1, u0xfffffff0
+ %3 = icmp ne i64 %2, 0
+ %4 = zext i1 %3 to i64
+ ret i64 %4
+}
+
+; Make sure the and constant doesn't get converted to an opaque constant by
+; ConstantHoisting. If it's an opaque constant, we'll have addi -16 and addi 15.
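+; The compare constant also folds into the shifted form: u0xffff9ab0 is a
+; negative 32-bit value whose arithmetic shift right by 4 is -1621, which is
+; why the lowering below ends in addi 1621 / seqz.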
+define i64 @test11(i64 %0) #0 {
+; RV32-LABEL: test11:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: srai a0, a0, 4
+; RV32-NEXT: addi a0, a0, 1621
+; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test11:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: sraiw a0, a0, 4
+; RV64-NEXT: addi a0, a0, 1621
+; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: ret
+entry:
+ %1 = add nuw nsw i64 %0, u0xffffffff
+ %2 = and i64 %1, u0xfffffff0
+ %3 = icmp eq i64 %2, u0xffff9ab0
+ %4 = zext i1 %3 to i64
+ ret i64 %4
+}
+
+; Make sure the and constant doesn't get converted to an opaque constant by
+; ConstantHoisting. If it's an opaque constant, we'll end up with constant
+; materialization sequences on RV64.
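+; In the low 32 bits this asks whether x - 16 == -13, i.e. x == 3, which is
+; why RV32 needs only addi -3 / seqz.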
+define i64 @test12(i64 %0) #0 {
+; RV32-LABEL: test12:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi a0, a0, -3
+; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test12:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addiw a0, a0, -16
+; RV64-NEXT: addi a0, a0, 13
+; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: ret
+entry:
+ %1 = add nuw nsw i64 %0, u0xfffffff0
+ %2 = and i64 %1, u0xffffffff
+ %3 = icmp eq i64 %2, u0xfffffff3
+ %4 = zext i1 %3 to i64
+ ret i64 %4
+}
+
+; Make sure the and constant doesn't get converted to an opaque constant by
+; ConstantHoisting.
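+; Only bit 31 of x + u0x8000000f survives the mask, so the check reduces to
+; extracting that sign bit (the shift-by-31 below) and testing that it is
+; clear.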
+define i64 @test13(i64 %0) #0 {
+; RV32-LABEL: test13:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: addi a1, a1, 15
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: srli a0, a0, 31
+; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test13:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lui a1, 524288
+; RV64-NEXT: addi a1, a1, -15
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: sraiw a0, a0, 31
+; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: ret
+entry:
+ %1 = add nuw nsw i64 %0, u0x8000000f
+ %2 = and i64 %1, u0x80000000
+ %3 = icmp eq i64 %2, 0
+ %4 = zext i1 %3 to i64
+ ret i64 %4
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll
new file mode 100644
index 0000000..093d172
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll
@@ -0,0 +1,24 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_relaxed_printf_string_address_space %s -o - | FileCheck %s
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+; CHECK: OpExtension "SPV_EXT_relaxed_printf_string_address_space"
+; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] printf
+
+; CHECK-ERROR: LLVM ERROR: SPV_EXT_relaxed_printf_string_address_space is required because printf uses a format string not in constant address space.
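+
+; The format string is a global in addrspace(1) (CrossWorkgroup) rather than
+; the constant address space, so translating the printf call is only legal
+; with the extension enabled; without it llc fails with the error above.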
+
+@.str = private unnamed_addr addrspace(1) constant [4 x i8] c"%d\0A\00", align 1
+
+declare spir_func i32 @printf(ptr addrspace(4), ...)
+
+define spir_kernel void @test_kernel() {
+entry:
+ ; Format string in addrspace(1) → cast to addrspace(4)
+ %format = addrspacecast ptr addrspace(1) @.str to ptr addrspace(4)
+ %val = alloca i32, align 4
+ store i32 123, ptr %val, align 4
+ %loaded = load i32, ptr %val, align 4
+
+  ; Call printf with a format string that is not in the constant address space
+ %call = call spir_func i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) %format, i32 %loaded)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll
new file mode 100644
index 0000000..b54d59b
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll
@@ -0,0 +1,48 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_relaxed_printf_string_address_space %s -o - | FileCheck %s
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+; CHECK: OpExtension "SPV_EXT_relaxed_printf_string_address_space"
+; CHECK: %[[#ExtInstSetId:]] = OpExtInstImport "OpenCL.std"
+; CHECK-DAG: %[[#TypeInt32Id:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#TypeInt8Id:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#TypeInt64Id:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#TypeArrayId:]] = OpTypeArray %[[#TypeInt8Id]] %[[#]]
+; CHECK-DAG: %[[#ConstantStorClassGlobalPtrTy:]] = OpTypePointer UniformConstant %[[#TypeArrayId]]
+; CHECK-DAG: %[[#WGStorClassGlobalPtrTy:]] = OpTypePointer Workgroup %[[#TypeArrayId]]
+; CHECK-DAG: %[[#CrossWFStorClassGlobalPtrTy:]] = OpTypePointer CrossWorkgroup %[[#TypeArrayId]]
+; CHECK-DAG: %[[#FunctionStorClassPtrTy:]] = OpTypePointer Function %[[#TypeInt8Id]]
+; CHECK-DAG: %[[#WGStorClassPtrTy:]] = OpTypePointer Workgroup %[[#TypeInt8Id]]
+; CHECK-DAG: %[[#CrossWFStorClassPtrTy:]] = OpTypePointer CrossWorkgroup %[[#TypeInt8Id]]
+; CHECK: %[[#ConstantCompositeId:]] = OpConstantComposite %[[#TypeArrayId]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]]
+; CHECK: %[[#]] = OpVariable %[[#ConstantStorClassGlobalPtrTy]] UniformConstant %[[#ConstantCompositeId]]
+; CHECK: %[[#]] = OpVariable %[[#CrossWFStorClassGlobalPtrTy]] CrossWorkgroup %[[#ConstantCompositeId]]
+; CHECK: %[[#]] = OpVariable %[[#WGStorClassGlobalPtrTy]] Workgroup %[[#ConstantCompositeId]]
+; CHECK: %[[#GEP1:]] = OpInBoundsPtrAccessChain %[[#FunctionStorClassPtrTy]] %[[#]] %[[#]] %[[#]]
+; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP1]]
+; CHECK: %[[#GEP2:]] = OpInBoundsPtrAccessChain %[[#CrossWFStorClassPtrTy]] %[[#]] %[[#]] %[[#]]
+; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP2]]
+; CHECK: %[[#GEP3:]] = OpInBoundsPtrAccessChain %[[#WGStorClassPtrTy]] %[[#]] %[[#]] %[[#]]
+; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP3]]
+
+; CHECK-ERROR: LLVM ERROR: SPV_EXT_relaxed_printf_string_address_space is required because printf uses a format string not in constant address space.
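+
+; The three printf format strings below end up in Function (via the memcpy to
+; an alloca), CrossWorkgroup and Workgroup storage respectively; each
+; placement is accepted only when the extension is enabled.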
+
+@0 = internal unnamed_addr addrspace(2) constant [6 x i8] c"Test\0A\00", align 1
+@1 = internal unnamed_addr addrspace(1) constant [6 x i8] c"Test\0A\00", align 1
+@2 = internal unnamed_addr addrspace(3) constant [6 x i8] c"Test\0A\00", align 1
+
+define spir_kernel void @test() {
+ %tmp1 = alloca [6 x i8], align 1
+ call void @llvm.memcpy.p0.p2.i64(ptr align 1 %tmp1, ptr addrspace(2) align 1 @0, i64 6, i1 false)
+ %1 = getelementptr inbounds [6 x i8], ptr %tmp1, i32 0, i32 0
+ %2 = call spir_func i32 @_Z18__spirv_ocl_printfPc(ptr %1)
+ %3 = getelementptr inbounds [6 x i8], ptr addrspace(1) @1, i32 0, i32 0
+ %4 = call spir_func i32 @_Z18__spirv_ocl_printfPU3AS1c(ptr addrspace(1) %3)
+ %5 = getelementptr inbounds [6 x i8], ptr addrspace(3) @2, i32 0, i32 0
+ %6 = call spir_func i32 @_Z18__spirv_ocl_printfPU3AS3c(ptr addrspace(3) %5)
+ ret void
+}
+
+declare spir_func i32 @_Z18__spirv_ocl_printfPc(ptr)
+declare spir_func i32 @_Z18__spirv_ocl_printfPU3AS1c(ptr addrspace(1))
+declare spir_func i32 @_Z18__spirv_ocl_printfPU3AS3c(ptr addrspace(3))
+declare void @llvm.memcpy.p0.p2.i64(ptr captures(none), ptr addrspace(2) captures(none) readonly, i64, i1)
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll
new file mode 100644
index 0000000..3624f14
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll
@@ -0,0 +1,19 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bindless_images %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+; CHECK-ERROR: LLVM ERROR: Parameter value must be a 32-bit scalar in case of Physical32 addressing model or a 64-bit scalar in case of Physical64 addressing model
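+
+; Under spirv64 the addressing model is Physical64, so the i32 %in handle
+; passed to ConvertHandleToImageINTEL below is expected to trigger the error.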
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+define spir_func void @foo(i32 %in) {
+ %img = call spir_func target("spirv.Image", i32, 2, 0, 0, 0, 0, 0, 0) @_Z33__spirv_ConvertHandleToImageINTELi(i32 %in)
+ %samp = call spir_func target("spirv.Sampler") @_Z35__spirv_ConvertHandleToSamplerINTELl(i64 42)
+ %sampImage = call spir_func target("spirv.SampledImage", i64, 1, 0, 0, 0, 0, 0, 0) @_Z40__spirv_ConvertHandleToSampledImageINTELl(i64 43)
+ ret void
+}
+
+declare spir_func target("spirv.Image", i32, 2, 0, 0, 0, 0, 0, 0) @_Z33__spirv_ConvertHandleToImageINTELi(i32)
+
+declare spir_func target("spirv.Sampler") @_Z35__spirv_ConvertHandleToSamplerINTELl(i64)
+
+declare spir_func target("spirv.SampledImage", i64, 1, 0, 0, 0, 0, 0, 0) @_Z40__spirv_ConvertHandleToSampledImageINTELl(i64)
diff --git a/llvm/test/CodeGen/SPIRV/image_store.ll b/llvm/test/CodeGen/SPIRV/image_store.ll
new file mode 100644
index 0000000..a70651c
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/image_store.ll
@@ -0,0 +1,22 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Image types may be represented in two ways while translating to SPIR-V:
+; - OpenCL form, for example, '%opencl.image2d_ro_t',
+; - SPIR-V form, for example, '%spirv.Image._void_1_0_0_0_0_0_0',
+; but both denote the same type, which should be translated to a single SPIR-V type.
+;
+; The test checks that the code below is successfully translated and only one
+; SPIR-V type for images is generated (no duplicate OpTypeImage instructions).
+
+; CHECK: %[[#]] = OpTypeImage %[[#]] 2D
+; CHECK-NOT: %[[#]] = OpTypeImage %[[#]] 2D
+
+declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(ptr addrspace(1), ptr addrspace(2), <2 x float>, float)
+
+define spir_kernel void @read_image(ptr addrspace(1) %srcimg, ptr addrspace(2) %sampler){
+entry:
+ %spirvimg.addr = alloca target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), align 8
+ %val = call <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(ptr addrspace(1) %srcimg, ptr addrspace(2) %sampler, <2 x float> zeroinitializer, float 0.0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll b/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll
new file mode 100644
index 0000000..b788f34b
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll
@@ -0,0 +1,28 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-LABEL: Begin function original_testcase
+define fastcc void @original_testcase() {
+top:
+ ; CHECK: OpCompositeInsert
+ %0 = insertvalue [1 x ptr] zeroinitializer, ptr poison, 0
+ ret void
+}
+
+; CHECK-LABEL: Begin function additional_testcases
+define fastcc void @additional_testcases() {
+top:
+ ; Test with different pointer types
+ ; CHECK: OpCompositeInsert
+ %1 = insertvalue [1 x ptr] zeroinitializer, ptr undef, 0
+ ; CHECK-NEXT: OpCompositeInsert
+ %2 = insertvalue {ptr, i32} zeroinitializer, ptr poison, 0
+ ; CHECK-NEXT: OpCompositeInsert
+ %3 = insertvalue {ptr, ptr} undef, ptr null, 0
+
+ ; Test with undef aggregate
+ ; CHECK-NEXT: OpCompositeInsert
+ %4 = insertvalue [1 x ptr] undef, ptr undef, 0
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll
new file mode 100644
index 0000000..49bb8ea
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll
@@ -0,0 +1,56 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpFOrdEqual
+; CHECK-DAG: OpFOrdGreaterThan
+; CHECK-DAG: OpFOrdGreaterThanEqual
+; CHECK-DAG: OpFOrdLessThan
+; CHECK-DAG: OpFOrdLessThanEqual
+; CHECK-DAG: OpFOrdNotEqual
+; CHECK-DAG: OpOrdered
+; CHECK-DAG: OpFUnordEqual
+; CHECK-DAG: OpFUnordGreaterThan
+; CHECK-DAG: OpFUnordGreaterThanEqual
+; CHECK-DAG: OpFUnordLessThan
+; CHECK-DAG: OpFUnordLessThanEqual
+; CHECK-DAG: OpFUnordNotEqual
+; CHECK-DAG: OpUnordered
+
+define dso_local spir_kernel void @test(float %a){
+entry:
+ %cmp = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"oeq", metadata !"fpexcept.strict")
+ %cmp1 = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"ogt", metadata !"fpexcept.strict")
+ %cmp2 = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"oge", metadata !"fpexcept.strict")
+ %cmp3 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"olt", metadata !"fpexcept.strict")
+ %cmp4 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ole", metadata !"fpexcept.strict")
+ %cmp5 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"one", metadata !"fpexcept.strict")
+ %cmp6 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ord", metadata !"fpexcept.strict")
+ %cmp7 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ueq", metadata !"fpexcept.strict")
+ %cmp8 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ugt", metadata !"fpexcept.strict")
+ %cmp9 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"uge", metadata !"fpexcept.strict")
+ %cmp10 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ult", metadata !"fpexcept.strict")
+ %cmp11 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ule", metadata !"fpexcept.strict")
+ %cmp12 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"une", metadata !"fpexcept.strict")
+ %cmp13 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"uno", metadata !"fpexcept.strict")
+
+ %or1 = or i1 %cmp, %cmp1
+ %or2 = or i1 %or1, %cmp2
+ %or3 = or i1 %or2, %cmp3
+ %or4 = or i1 %or3, %cmp4
+ %or5 = or i1 %or4, %cmp5
+ %or6 = or i1 %or5, %cmp6
+ %or7 = or i1 %or6, %cmp7
+ %or8 = or i1 %or7, %cmp8
+ %or9 = or i1 %or8, %cmp9
+ %or10 = or i1 %or9, %cmp10
+ %or11 = or i1 %or10, %cmp11
+ %or12 = or i1 %or11, %cmp12
+ %or13 = or i1 %or12, %cmp13
+ br i1 %or13, label %true_block, label %false_block
+true_block:
+ ret void
+false_block:
+ ret void
+}
+declare i1 @llvm.experimental.constrained.fcmps.f32(float, float, metadata, metadata)
+declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata)
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll
new file mode 100644
index 0000000..fd8cb9d
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll
@@ -0,0 +1,14 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+
+; CHECK: OpNop
+; CHECK-NEXT: OpReturn
+
+declare void @llvm.debugtrap()
+
+define spir_kernel void @foo(ptr addrspace(1) %a){
+entry:
+ %a.addr = alloca ptr addrspace(1), align 4
+ store ptr addrspace(1) %a, ptr %a.addr, align 4
+ call void @llvm.debugtrap()
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll
new file mode 100644
index 0000000..f6434e9
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll
@@ -0,0 +1,114 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
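+; llvm.frexp returns { fraction, exponent }, while the OpenCL.std frexp takes
+; a pointer for the exponent; each lowering therefore allocates a Function
+; scope OpVariable, passes it to the extinst and loads the exponent back.
+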
+; CHECK-DAG: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std"
+; CHECK-DAG: %[[#float_32_type:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#int_32_type:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#fn_ptr_type_i32:]] = OpTypePointer Function %[[#int_32_type]]
+; CHECK-DAG: %[[#const_negzero:]] = OpConstant %[[#float_32_type]] -0
+; CHECK-DAG: %[[#vec2_float_type:]] = OpTypeVector %[[#float_32_type]] 2
+; CHECK-DAG: %[[#vec2_int_type:]] = OpTypeVector %[[#int_32_type]] 2
+; CHECK-DAG: %[[#fn_ptr_type_vec2_i32:]] = OpTypePointer Function %[[#vec2_int_type]]
+; CHECK-DAG: %[[#vec2_null:]] = OpConstantNull %[[#vec2_float_type]]
+; CHECK-DAG: %[[#scalar_null:]] = OpConstantNull %[[#float_32_type]]
+; CHECK-DAG: %[[#const_composite1:]] = OpConstantComposite %[[#vec2_float_type]] %[[#scalar_null]] %[[#const_negzero]]
+; CHECK-DAG: %[[#vec4_float_type:]] = OpTypeVector %[[#float_32_type]] 4
+; CHECK-DAG: %[[#vec4_int_type:]] = OpTypeVector %[[#int_32_type]] 4
+; CHECK-DAG: %[[#fn_ptr_type_vec4_i32:]] = OpTypePointer Function %[[#vec4_int_type]]
+; CHECK-DAG: %[[#const_composite2:]] = OpConstantComposite %[[#vec4_float_type]] %[[#const_16:]] %[[#const_neg32:]] %[[#const_0:]] %[[#const_9999:]]
+; CHECK-DAG: %[[#float_64_type:]] = OpTypeFloat 64
+; CHECK-DAG: %[[#vec2_double_type:]] = OpTypeVector %[[#float_64_type]] 2
+
+; CHECK: %[[#]] = OpFunctionParameter %[[#float_32_type]]
+; CHECK: %[[#var1:]] = OpVariable %[[#fn_ptr_type_i32]] Function
+; CHECK: %[[#extinst1:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#const_negzero]] %[[#var1]]
+; CHECK: %[[#exp_part_var:]] = OpLoad %[[#int_32_type]] %[[#var1]]
+; CHECK: OpReturnValue %[[#exp_part_var]]
+define i32 @frexp_negzero(float %x) {
+ %ret = call { float, i32 } @llvm.frexp.f32.i32(float -0.0)
+ %f_part = extractvalue { float, i32 } %ret, 0
+ %exp_part = extractvalue { float, i32 } %ret, 1
+ ret i32 %exp_part
+}
+
+; CHECK: %[[#x_var4:]] = OpFunctionParameter %[[#float_32_type]]
+; CHECK: %[[#var10:]] = OpVariable %[[#fn_ptr_type_i32]] Function
+; CHECK: %[[#extinst10:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#x_var4]] %[[#var10]]
+; CHECK: %[[#exp_part_var2:]] = OpLoad %[[#int_32_type]] %[[#var10]]
+; CHECK: OpReturnValue %[[#exp_part_var2]]
+define i32 @frexp_frexp_get_int(float %x) {
+ %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x)
+ %f_part = extractvalue { float, i32 } %frexp0, 0
+ %exp_part = extractvalue { float, i32 } %frexp0, 1
+ ret i32 %exp_part
+}
+
+; CHECK: %[[#var3:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function
+; CHECK: %[[#extinst3:]] = OpExtInst %[[#vec2_float_type]] %[[#extinst_id]] frexp %[[#vec2_null]] %[[#var3]]
+; CHECK: %[[#f_part_var2:]] = OpLoad %[[#vec2_int_type]] %[[#var3]]
+; CHECK: OpReturnValue %[[#extinst3]]
+define <2 x float> @frexp_zero_vector() {
+ %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> zeroinitializer)
+ %f_part = extractvalue { <2 x float>, <2 x i32> } %ret, 0
+ %exp_part = extractvalue { <2 x float>, <2 x i32> } %ret, 1
+ ret <2 x float> %f_part
+}
+
+; CHECK: %[[#var4:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function
+; CHECK: %[[#extinst4:]] = OpExtInst %[[#vec2_float_type]] %[[#extinst_id]] frexp %[[#const_composite1]] %[[#var4]]
+; CHECK: %[[#f_part_var3:]] = OpLoad %[[#vec2_int_type]] %[[#var4]]
+; CHECK: OpReturnValue %[[#extinst4]]
+define <2 x float> @frexp_zero_negzero_vector() {
+ %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> <float 0.0, float -0.0>)
+ %f_part = extractvalue { <2 x float>, <2 x i32> } %ret, 0
+ %exp_part = extractvalue { <2 x float>, <2 x i32> } %ret, 1
+ ret <2 x float> %f_part
+}
+
+; CHECK: %[[#var5:]] = OpVariable %[[#fn_ptr_type_vec4_i32]] Function
+; CHECK: %[[#extinst5:]] = OpExtInst %[[#vec4_float_type]] %[[#extinst_id]] frexp %[[#const_composite2]] %[[#var5]]
+; CHECK: %[[#f_part_var4:]] = OpLoad %[[#vec4_int_type]] %[[#var5]]
+; CHECK: OpReturnValue %[[#extinst5]]
+define <4 x float> @frexp_nonsplat_vector() {
+ %ret = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> <float 16.0, float -32.0, float 0.0, float 9999.0>)
+ %f_part = extractvalue { <4 x float>, <4 x i32> } %ret, 0
+ %exp_part = extractvalue { <4 x float>, <4 x i32> } %ret, 1
+ ret <4 x float> %f_part
+}
+
+; CHECK: %[[#x_var2:]] = OpFunctionParameter %[[#float_32_type]]
+; CHECK: %[[#var6:]] = OpVariable %[[#fn_ptr_type_i32]] Function
+; CHECK: %[[#var7:]] = OpVariable %[[#fn_ptr_type_i32]] Function
+; CHECK: %[[#extinst6:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#x_var2]] %[[#var6]]
+; CHECK: %[[#load1:]] = OpLoad %[[#int_32_type]] %[[#var6]]
+; CHECK: %[[#extinst7:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#extinst6]] %[[#var7]]
+; CHECK: %[[#f_part_var5:]] = OpLoad %[[#int_32_type]] %[[#var7]]
+; CHECK: OpReturnValue %[[#extinst7]]
+define float @frexp_frexp(float %x) {
+ %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x)
+ %frexp0_f_part = extractvalue { float, i32 } %frexp0, 0
+ %frexp0_exp_part = extractvalue { float, i32 } %frexp0, 1
+ %frexp1 = call { float, i32 } @llvm.frexp.f32.i32(float %frexp0_f_part)
+ %frexp1_f_part = extractvalue { float, i32 } %frexp1, 0
+ %frexp1_exp_part = extractvalue { float, i32 } %frexp1, 1
+ ret float %frexp1_f_part
+}
+
+; CHECK: %[[#x_var3:]] = OpFunctionParameter %[[#vec2_double_type]]
+; CHECK: %[[#var9:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function
+; CHECK: %[[#extinst9:]] = OpExtInst %[[#vec2_double_type]] %[[#extinst_id]] frexp %[[#x_var3]] %[[#var9]]
+; CHECK: %[[#f_part_var6:]] = OpLoad %[[#vec2_int_type]] %[[#var9]]
+; CHECK: OpReturnValue %[[#extinst9]]
+define <2 x double> @frexp_frexp_vector(<2 x double> %x) {
+ %frexp0 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %x)
+ %f_part = extractvalue { <2 x double>, <2 x i32> } %frexp0, 0
+ %exp_part = extractvalue { <2 x double>, <2 x i32> } %frexp0, 1
+ ret <2 x double> %f_part
+}
+
+declare { float, i32 } @llvm.frexp.f32.i32(float)
+declare { double, i32 } @llvm.frexp.f64.i32(double)
+declare { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float>)
+declare { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float>)
+declare { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double>)
+declare { float, i8 } @llvm.frexp.f32.i8(float)
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll
index a15a807..b3ef6d6 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll
@@ -11,7 +11,6 @@
define spir_kernel void @foo(ptr %p) {
entry:
call void @llvm.trap()
- call void @llvm.debugtrap()
call void @llvm.ubsantrap(i8 100)
%r1 = call ptr @llvm.invariant.start.p0(i64 1024, ptr %p)
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll
new file mode 100644
index 0000000..51b7664
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll
@@ -0,0 +1,86 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-NOT: llvm.memmove
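+
+; Each llvm.memmove below lowers to OpCopyMemorySized: i32 sizes are widened
+; to i64 with OpUConvert, and generic pointers are bridged with
+; OpGenericCastToPtr / OpPtrCastToGeneric where needed.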
+
+; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#Int32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#Ptr_CrossWG_8:]] = OpTypePointer CrossWorkgroup %[[#Int8]]
+; CHECK-DAG: %[[#Ptr_Generic_32:]] = OpTypePointer Generic %[[#Int32]]
+; CHECK-DAG: %[[#Const_64:]] = OpConstant %[[#Int32]] 64
+; CHECK-DAG: %[[#Const_36:]] = OpConstant %[[#Int32]] 36
+; CHECK-DAG: %[[#Const_30:]] = OpConstant %[[#Int32]] 30
+; CHECK-DAG: %[[#Const_32_64:]] = OpConstant %[[#Int64]] 32
+
+; CHECK: %[[#Param1:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]]
+; CHECK: %[[#Param2:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]]
+; CHECK: %[[#Size1:]] = OpUConvert %[[#Int64]] %[[#Const_64]]
+; CHECK: OpCopyMemorySized %[[#Param2]] %[[#Param1]] %[[#Size1]] Aligned 64
+
+; CHECK: %[[#Src:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]]
+; CHECK: %[[#CastDst2:]] = OpGenericCastToPtr %[[#Ptr_CrossWG_8]] %[[#GenPtr:]]
+; CHECK: %[[#Size2:]] = OpUConvert %[[#Int64]] %[[#Const_36]]
+; CHECK: OpCopyMemorySized %[[#CastDst2]] %[[#Src]] %[[#Size2]] Aligned 64
+
+; CHECK: %[[#Param1:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]]
+; CHECK: %[[#Param2:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]]
+; CHECK: %[[#Size3:]] = OpUConvert %[[#Int64]] %[[#Const_30]]
+; CHECK: OpCopyMemorySized %[[#Param2]] %[[#Param1]] %[[#Size3]] Aligned 1
+
+; CHECK: %[[#Phi:]] = OpPhi %[[#Ptr_Generic_32]] %[[#Op1:]] %[[#Lbl1:]] %[[#Op2:]] %[[#Lbl2:]]
+; CHECK: %[[#Cast:]] = OpPtrCastToGeneric %[[#]] %[[#]]
+; CHECK: OpCopyMemorySized %[[#Cast]] %[[#Phi]] %[[#Const_32_64]] Aligned 8
+
+%struct.SomeStruct = type { <16 x float>, i32, [60 x i8] }
+%class.kfunc = type <{ i32, i32, i32, [4 x i8] }>
+
+@InvocIndex = external local_unnamed_addr addrspace(1) constant i64, align 8
+@"func_object1" = internal addrspace(3) global %class.kfunc zeroinitializer, align 8
+
+define spir_kernel void @test_full_move(%struct.SomeStruct addrspace(1)* captures(none) readonly %in, %struct.SomeStruct addrspace(1)* captures(none) %out) {
+ %1 = bitcast %struct.SomeStruct addrspace(1)* %in to i8 addrspace(1)*
+ %2 = bitcast %struct.SomeStruct addrspace(1)* %out to i8 addrspace(1)*
+ call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* align 64 %2, i8 addrspace(1)* align 64 %1, i32 64, i1 false)
+ ret void
+}
+
+define spir_kernel void @test_partial_move(%struct.SomeStruct addrspace(1)* captures(none) readonly %in, %struct.SomeStruct addrspace(4)* captures(none) %out) {
+ %1 = bitcast %struct.SomeStruct addrspace(1)* %in to i8 addrspace(1)*
+ %2 = bitcast %struct.SomeStruct addrspace(4)* %out to i8 addrspace(4)*
+ %3 = addrspacecast i8 addrspace(4)* %2 to i8 addrspace(1)*
+ call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* align 64 %3, i8 addrspace(1)* align 64 %1, i32 36, i1 false)
+ ret void
+}
+
+define spir_kernel void @test_array(i8 addrspace(1)* %in, i8 addrspace(1)* %out) {
+ call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 30, i1 false)
+ ret void
+}
+
+define weak_odr dso_local spir_kernel void @test_phi() local_unnamed_addr {
+entry:
+ %0 = alloca i32, align 8
+ %1 = addrspacecast i32* %0 to i32 addrspace(4)*
+ %2 = load i64, i64 addrspace(1)* @InvocIndex, align 8
+ %cmp = icmp eq i64 %2, 0
+ br i1 %cmp, label %leader, label %entry.merge_crit_edge
+
+entry.merge_crit_edge: ; preds = %entry
+ %3 = bitcast i32 addrspace(4)* %1 to i8 addrspace(4)*
+ br label %merge
+
+leader: ; preds = %entry
+ %4 = bitcast i32 addrspace(4)* %1 to i8 addrspace(4)*
+ br label %merge
+
+merge: ; preds = %entry.merge_crit_edge, %leader
+ %phi = phi i8 addrspace(4)* [ %3, %entry.merge_crit_edge ], [ %4, %leader ]
+ %5 = addrspacecast i8 addrspace(3)* bitcast (%class.kfunc addrspace(3)* @"func_object1" to i8 addrspace(3)*) to i8 addrspace(4)*
+ call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* align 8 dereferenceable(32) %5, i8 addrspace(4)* align 8 dereferenceable(32) %phi, i64 32, i1 false)
+ ret void
+}
+
+declare void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* captures(none) writeonly, i8 addrspace(4)* captures(none) readonly, i64, i1 immarg)
+
+declare void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* captures(none), i8 addrspace(1)* captures(none) readonly, i32, i1)
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll
new file mode 100644
index 0000000..52f939f
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll
@@ -0,0 +1,30 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -filetype=obj -o - | spirv-val %}
+; XFAIL: *
+; @llvm.sadd.with.overflow and @llvm.ssub.with.overflow have not been implemented yet.
+
+define spir_func void @test_sadd_overflow(ptr %out_result, ptr %out_overflow, i32 %a, i32 %b) {
+entry:
+ %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+ %val = extractvalue { i32, i1 } %res, 0
+ %ofl = extractvalue { i32, i1 } %res, 1
+ store i32 %val, ptr %out_result
+ %zext_ofl = zext i1 %ofl to i8
+ store i8 %zext_ofl, ptr %out_overflow
+ ret void
+}
+
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
+
+define spir_func void @test_ssub_overflow(ptr %out_result, ptr %out_overflow, i32 %a, i32 %b) {
+entry:
+ %res = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
+ %val = extractvalue { i32, i1 } %res, 0
+ %ofl = extractvalue { i32, i1 } %res, 1
+ store i32 %val, ptr %out_result
+ %zext_ofl = zext i1 %ofl to i8
+ store i8 %zext_ofl, ptr %out_overflow
+ ret void
+}
+
+declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll b/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll
index e405ef0..5e66b8b6 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll
@@ -7,10 +7,11 @@
;;
;; Positive tests:
;;
-; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_no_integer_wrap_decoration %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-NEGATIVE
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_no_integer_wrap_decoration %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV
;;
;; Negative tests:
;;
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV-NEGATIVE
;; Check that the backend is able to skip nsw/nuw attributes if the extension is
;; disabled implicitly or explicitly and if the max SPIR-V version is lower than 1.4
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll
new file mode 100644
index 0000000..c8953c7
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll
@@ -0,0 +1,11 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: %[[#PtrT:]] = OpTypePointer Workgroup %[[#]]
+; CHECK-SPIRV: %[[#]] = OpVariable %[[#PtrT]] Workgroup
+
+@test_atomic_fn.L = internal addrspace(3) global [64 x i32] zeroinitializer, align 4
+
+define spir_kernel void @test_atomic_fn() {
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll
new file mode 100644
index 0000000..607997d
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll
@@ -0,0 +1,140 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
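+; The OpenCL pipe builtins (__reserve_*/__commit_*/__read_*/__write_*) map
+; onto the corresponding SPIR-V pipe instructions; the scope operands 2 and 3
+; used by the group variants below are Workgroup and Subgroup respectively.
+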
+; CHECK: OpCapability Kernel
+; CHECK: OpCapability Addresses
+; CHECK: OpCapability Pipes
+; CHECK: OpCapability Int8
+; CHECK: OpCapability GenericPointer
+
+; CHECK-DAG: %[[#PipeWriteTy:]] = OpTypePipe WriteOnly
+; CHECK-DAG: %[[#PipeReadTy:]] = OpTypePipe ReadOnly
+; CHECK-DAG: %[[#ReserveIdTy:]] = OpTypeReserveId
+; CHECK-DAG: %[[#BoolTy:]] = OpTypeBool
+; CHECK-DAG: %[[#Int32Ty:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#Uint1:]] = OpConstant %[[#Int32Ty]] 1
+; CHECK-DAG: %[[#Uint2:]] = OpConstant %[[#Int32Ty]] 2
+; CHECK-DAG: %[[#Uint3:]] = OpConstant %[[#Int32Ty]] 3
+; CHECK-DAG: %[[#Uint4:]] = OpConstant %[[#Int32Ty]] 4
+; CHECK-DAG: %[[#NullUint:]] = OpConstantNull %[[#Int32Ty]]
+
+; CHECK: OpFunction
+; CHECK: %[[#FuncParam1:]] = OpFunctionParameter %[[#PipeWriteTy]]
+; CHECK: %[[#FuncParam2:]] = OpFunctionParameter %[[#PipeReadTy]]
+
+; CHECK: %[[#BasicWriteReserve:]] = OpReserveWritePipePackets %[[#ReserveIdTy]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpWritePipe %[[#Int32Ty]] %[[#FuncParam1]] %[[#]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpCommitWritePipe %[[#FuncParam1]] %[[#BasicWriteReserve]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: %[[#BasicReadReserve:]] = OpReserveReadPipePackets %[[#ReserveIdTy]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpReadPipe %[[#Int32Ty]] %[[#FuncParam2]] %[[#]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpCommitReadPipe %[[#FuncParam2]] %[[#BasicReadReserve]] %[[#Uint4]] %[[#Uint4]]
+
+; --- Reserved pipe operations ---
+; CHECK: %[[#ReservedWriteReserve:]] = OpReserveWritePipePackets %[[#ReserveIdTy]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: %[[#ReservedWrite:]] = OpReservedWritePipe %[[#Int32Ty]] %[[#FuncParam1]] %[[#ReservedWriteReserve]] %[[#NullUint]] %[[#]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: %[[#IsValidWrite:]] = OpIsValidReserveId %[[#BoolTy]] %[[#ReservedWriteReserve]]
+; CHECK: OpCommitWritePipe %[[#FuncParam1]] %[[#ReservedWriteReserve]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: %[[#ReservedReadReserve:]] = OpReserveReadPipePackets %[[#ReserveIdTy]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: %[[#ReservedRead:]] = OpReservedReadPipe %[[#Int32Ty]] %[[#FuncParam2]] %[[#ReservedReadReserve]] %[[#NullUint]] %[[#]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: %[[#IsValidRead:]] = OpIsValidReserveId %[[#BoolTy]] %[[#ReservedReadReserve]]
+; CHECK: OpCommitReadPipe %[[#FuncParam2]] %[[#ReservedReadReserve]] %[[#Uint4]] %[[#Uint4]]
+
+; --- Pipe packet queries ---
+; CHECK: %[[#MaxPacketsWO:]] = OpGetMaxPipePackets %[[#Int32Ty]] %[[#FuncParam1]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpStore %[[#]] %[[#MaxPacketsWO]] Aligned 4
+; CHECK: %[[#NumPacketsWO:]] = OpGetNumPipePackets %[[#Int32Ty]] %[[#FuncParam1]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpStore %[[#]] %[[#NumPacketsWO]] Aligned 4
+; CHECK: %[[#MaxPacketsRO:]] = OpGetMaxPipePackets %[[#Int32Ty]] %[[#FuncParam2]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpStore %[[#]] %[[#MaxPacketsRO]] Aligned 4
+; CHECK: %[[#NumPacketsRO:]] = OpGetNumPipePackets %[[#Int32Ty]] %[[#FuncParam2]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpStore %[[#]] %[[#NumPacketsRO]] Aligned 4
+
+; --- Workgroup operations ---
+; CHECK: %[[#WorkgroupWriteReserve:]] = OpGroupReserveWritePipePackets %[[#ReserveIdTy]] %[[#Uint2]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint1]] %[[#Uint1]]
+; CHECK: OpGroupCommitWritePipe %[[#Uint2]] %[[#FuncParam1]] %[[#WorkgroupWriteReserve]] %[[#Uint1]] %[[#Uint1]]
+; CHECK: %[[#WorkgroupReadReserve:]] = OpGroupReserveReadPipePackets %[[#ReserveIdTy]] %[[#Uint2]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint1]] %[[#Uint1]]
+; CHECK: OpGroupCommitReadPipe %[[#Uint2]] %[[#FuncParam2]] %[[#WorkgroupReadReserve]] %[[#Uint1]] %[[#Uint1]]
+
+; --- Subgroup operations ---
+; CHECK: %[[#SubgroupWriteReserve:]] = OpGroupReserveWritePipePackets %[[#ReserveIdTy]] %[[#Uint3]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpGroupCommitWritePipe %[[#Uint3]] %[[#FuncParam1]] %[[#SubgroupWriteReserve]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: %[[#SubgroupReadReserve:]] = OpGroupReserveReadPipePackets %[[#ReserveIdTy]] %[[#Uint3]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]]
+; CHECK: OpGroupCommitReadPipe %[[#Uint3]] %[[#FuncParam2]] %[[#SubgroupReadReserve]] %[[#Uint4]] %[[#Uint4]]
+
+define spir_kernel void @test_pipe_builtins(
+ target("spirv.Pipe", 1) %out_pipe,
+ target("spirv.Pipe", 0) %in_pipe,
+ ptr addrspace(4) %src,
+ ptr addrspace(4) %dst,
+ ptr addrspace(1) %max_packets_wo,
+ ptr addrspace(1) %num_packets_wo,
+ ptr addrspace(1) %max_packets_ro,
+ ptr addrspace(1) %num_packets_ro
+) {
+entry:
+ ; Basic pipe operations
+ %0 = call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4)
+ %1 = call spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1) %out_pipe, ptr addrspace(4) %src, i32 4, i32 4)
+ call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %0, i32 4, i32 4)
+
+ %2 = call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4)
+ %3 = call spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0) %in_pipe, ptr addrspace(4) %dst, i32 4, i32 4)
+ call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %2, i32 4, i32 4)
+
+ ; Reserved pipe operations
+ %4 = call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4)
+ %5 = call spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %4, i32 0, ptr addrspace(4) %src, i32 4, i32 4)
+ %6 = call spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId") %4)
+ call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %4, i32 4, i32 4)
+
+ %7 = call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4)
+ %8 = call spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %7, i32 0, ptr addrspace(4) %dst, i32 4, i32 4)
+ %9 = call spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId") %7)
+ call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %7, i32 4, i32 4)
+
+ ; Pipe packet queries
+ %10 = call spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1) %out_pipe, i32 4, i32 4)
+ store i32 %10, ptr addrspace(1) %max_packets_wo, align 4
+ %11 = call spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1) %out_pipe, i32 4, i32 4)
+ store i32 %11, ptr addrspace(1) %num_packets_wo, align 4
+ %12 = call spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0) %in_pipe, i32 4, i32 4)
+ store i32 %12, ptr addrspace(1) %max_packets_ro, align 4
+ %13 = call spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0) %in_pipe, i32 4, i32 4)
+ store i32 %13, ptr addrspace(1) %num_packets_ro, align 4
+
+ ; Workgroup operations
+ %14 = call spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 1, i32 1)
+ call spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %14, i32 1, i32 1)
+ %15 = call spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 1, i32 1)
+ call spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %15, i32 1, i32 1)
+
+ ; Subgroup operations
+ %16 = call spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4)
+ call spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %16, i32 4, i32 4)
+ %17 = call spir_func target("spirv.ReserveId") @__sub_group_reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4)
+ call spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %17, i32 4, i32 4)
+
+ ret void
+}
+
+declare spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32)
+declare spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32)
+declare spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32)
+declare spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32)
+declare spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, ptr addrspace(4), i32, i32)
+declare spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, ptr addrspace(4), i32, i32)
+declare spir_func void @__commit_write_pipe(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, i32)
+declare spir_func void @__commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32)
+declare spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId"))
+declare spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1), i32, i32)
+declare spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1), i32, i32)
+declare spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0), i32, i32)
+declare spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0), i32, i32)
+declare spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32)
+declare spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, i32)
+declare spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32)
+declare spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32)
+declare spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32)
+declare spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, i32)
+declare spir_func target("spirv.ReserveId") @__sub_group_reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32)
+declare spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32)
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll
new file mode 100644
index 0000000..4c64a12
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll
@@ -0,0 +1,16 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpDecorate %[[#Id:]] BuiltIn GlobalInvocationId
+; CHECK: %[[#Id]] = OpVariable %[[#]] CrossWorkgroup
+
+@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32
+
+define spir_kernel void @f() {
+entry:
+ %0 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll b/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll
new file mode 100644
index 0000000..74ce26b
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll
@@ -0,0 +1,30 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Check that a saturated conversion is translated correctly when there is a
+; forward declaration of the SPIR-V entry.
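+; The phi of %new_val.0 in %for.cond refers to %call1 before its definition
+; in %for.body, which is what creates the forward reference.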
+
+; CHECK: OpDecorate %[[#SAT:]] SaturatedConversion
+; CHECK: %[[#SAT]] = OpConvertFToU %[[#]] %[[#]]
+
+declare spir_func zeroext i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float)
+
+define spir_func void @forward(float %val, i8 %initval, ptr addrspace(1) %dst) {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %new_val.0 = phi i8 [ %initval, %entry ], [ %call1, %for.body ]
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp ult i32 %i.0, 1
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %call1 = call spir_func zeroext i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float noundef %val)
+ %inc = add i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ store i8 %new_val.0, ptr addrspace(1) %dst, align 1
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/float16.ll b/llvm/test/CodeGen/SPIRV/transcoding/float16.ll
new file mode 100644
index 0000000..0018dba
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/transcoding/float16.ll
@@ -0,0 +1,25 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV: %[[#HALF:]] = OpTypeFloat 16
+; CHECK-SPIRV: %[[#HALFPTR:]] = OpTypePointer Function %[[#HALF]]
+; CHECK-SPIRV: %[[#HALFV2:]] = OpTypeVector %[[#HALF]] 2
+; CHECK-SPIRV: %[[#HALFV2PTR:]] = OpTypePointer Function %[[#HALFV2]]
+; CHECK-SPIRV: %[[#CONST:]] = OpConstant %[[#HALF]] 14788
+; CHECK-SPIRV: %[[#ADDR:]] = OpVariable %[[#HALFPTR]] Function
+; CHECK-SPIRV: %[[#ADDR2:]] = OpVariable %[[#HALFV2PTR]] Function
+; CHECK-SPIRV: %[[#]] = OpExtInst %[[#HALF]] %[[#]] fract %[[#CONST]] %[[#ADDR]]
+; CHECK-SPIRV: %[[#]] = OpExtInst %[[#HALFV2]] %[[#]] fract %[[#]] %[[#ADDR2]]
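+
+; Note: 14788 is the raw 16-bit encoding of the half literal 0xH39C4 (~0.72).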
+
+define spir_kernel void @test() {
+entry:
+ %addr = alloca half
+ %addr2 = alloca <2 x half>
+ %res = call spir_func noundef half @_Z17__spirv_ocl_fractDF16_PU3AS0DF16_(half noundef 0xH39C4, ptr noundef %addr)
+ %res2 = call spir_func noundef <2 x half> @_Z17__spirv_ocl_fractDv2_DF16_PU3AS0S_(<2 x half> noundef <half 0xH39C4, half 0xH0000>, ptr noundef %addr2)
+ ret void
+}
+
+declare spir_func noundef half @_Z17__spirv_ocl_fractDF16_PU3AS0DF16_(half noundef, ptr noundef) local_unnamed_addr
+
+declare spir_func noundef <2 x half> @_Z17__spirv_ocl_fractDv2_DF16_PU3AS0S_(<2 x half> noundef, ptr noundef) local_unnamed_addr
diff --git a/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll b/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll
index 6679b5f5..41fa346 100644
--- a/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll
+++ b/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll
@@ -8,7 +8,7 @@ define void @neg_8bit_1(i1 %cmp) {
; NDD-NEXT: andb $1, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xe7,0x01]
; NDD-NEXT: movzbl 0, %ecx # encoding: [0x0f,0xb6,0x0c,0x25,0x00,0x00,0x00,0x00]
; NDD-NEXT: negb %al, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd8]
-; NDD-NEXT: leab 2(%rcx,%rax), %al # encoding: [0x66,0x8d,0x44,0x01,0x02]
+; NDD-NEXT: leal 2(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0x02]
; NDD-NEXT: movb %al, 0 # encoding: [0x88,0x04,0x25,0x00,0x00,0x00,0x00]
; NDD-NEXT: retq # encoding: [0xc3]
entry:
@@ -25,7 +25,8 @@ define void @neg_8bit_2(i8 %int8) {
; NDD-NEXT: # kill: def $edi killed $edi def $rdi
; NDD-NEXT: addb %dil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xff]
; NDD-NEXT: negb %al, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd8]
-; NDD-NEXT: leab 1(%rdi,%rax), %al # encoding: [0x66,0x8d,0x44,0x07,0x01]
+; NDD-NEXT: leal 1(%rdi,%rax), %eax # encoding: [0x8d,0x44,0x07,0x01]
+; NDD-NEXT: # kill: def $al killed $al killed $eax
; NDD-NEXT: mulb %dil # encoding: [0x40,0xf6,0xe7]
; NDD-NEXT: testb %al, %al # encoding: [0x84,0xc0]
; NDD-NEXT: retq # encoding: [0xc3]
@@ -55,7 +56,7 @@ define i32 @neg_16bit(i16 %0) {
; NDD-NEXT: cmovsl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x48,0xc1]
; NDD-NEXT: andw $-256, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x25,0x00,0xff]
; NDD-NEXT: negw %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd8]
-; NDD-NEXT: leaw 1(%rdi,%rax), %ax # encoding: [0x66,0x8d,0x44,0x07,0x01]
+; NDD-NEXT: leal 1(%rdi,%rax), %eax # encoding: [0x8d,0x44,0x07,0x01]
; NDD-NEXT: movzwl %ax, %eax # encoding: [0x0f,0xb7,0xc0]
; NDD-NEXT: movq %rax, 0 # encoding: [0x48,0x89,0x04,0x25,0x00,0x00,0x00,0x00]
; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 8aa898f..da0cef0 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -2119,8 +2119,7 @@ define void @ktest_1(<8 x double> %in, ptr %base) {
; KNL-LABEL: ktest_1:
; KNL: ## %bb.0:
; KNL-NEXT: vcmpgtpd (%rdi), %zmm0, %k1
-; KNL-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
-; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; KNL-NEXT: vcmpltpd 8(%rdi), %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: je LBB44_2
@@ -2152,8 +2151,7 @@ define void @ktest_1(<8 x double> %in, ptr %base) {
; AVX512BW-LABEL: ktest_1:
; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vcmpgtpd (%rdi), %zmm0, %k1
-; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
-; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT: vcmpltpd 8(%rdi), %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: je LBB44_2
diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll
index ff9f995..51a8bf5 100644
--- a/llvm/test/CodeGen/X86/combine-add.ll
+++ b/llvm/test/CodeGen/X86/combine-add.ll
@@ -235,10 +235,10 @@ define void @PR52039(ptr %pa, ptr %pb) {
; SSE-NEXT: psubd %xmm1, %xmm3
; SSE-NEXT: psubd %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: paddd %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: movdqu %xmm3, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 8e4a50e..ae4d24f 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -81,7 +81,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
+; SSE-NEXT: paddq %xmm2, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psllq $4, %xmm2
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 98187d6..6bcbfe1 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2187,13 +2187,13 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128]
; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: paddw %xmm4, %xmm4
-; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7]
+; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT: paddw %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: paddb %xmm1, %xmm2
@@ -2201,15 +2201,14 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: paddw %xmm0, %xmm3
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
-; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: psllw $7, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
+; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: psraw $8, %xmm2
; SSE41-NEXT: psllw $7, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
+; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
@@ -2225,18 +2224,17 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128]
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
-; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll b/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll
index 638a65d..7542c1b 100644
--- a/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll
+++ b/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll
@@ -1,5 +1,13 @@
; RUN: llc -mtriple=x86_64 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck %s
-;
+
+; Ensure that the static local variable "elemnt" is placed in the abstract subprogram DIE.
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_inline (DW_INL_inlined)
+; CHECK-EMPTY:
+; CHECK-NEXT: DW_TAG_variable
+; CHECK-NEXT: DW_AT_name ("elemnt")
+
; CHECK: [[SYM:[a-z0-9]+]]: DW_TAG_formal_parameter
; CHECK: DW_AT_name ("esym")
; CHECK: DW_AT_type ([[TYPE:[a-z0-9]+]] "CHARACTER_1")
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 3aa77c3..7bd22d5 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -1,40 +1,25 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI
define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) {
-; AVXVNNI-LABEL: no_dpbusd:
-; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
-; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vmovd %xmm0, %eax
-; AVXVNNI-NEXT: addl %edx, %eax
-; AVXVNNI-NEXT: vzeroupper
-; AVXVNNI-NEXT: retq
-;
-; AVX512-LABEL: no_dpbusd:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: addl %edx, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; CHECK-LABEL: no_dpbusd:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; CHECK-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
entry:
%0 = load <16 x i8>, ptr %a, align 16
%1 = zext <16 x i8> %0 to <16 x i32>
@@ -99,25 +84,44 @@ entry:
}
define i32 @mul_zext(ptr%a, ptr%b, i32 %c, i32 %n) {
-; AVXVNNI-LABEL: mul_zext:
-; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1
-; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0
-; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vmovd %xmm0, %eax
-; AVXVNNI-NEXT: addl %edx, %eax
-; AVXVNNI-NEXT: vzeroupper
-; AVXVNNI-NEXT: retq
+; AVXVNNI-AVX-LABEL: mul_zext:
+; AVXVNNI-AVX: # %bb.0: # %entry
+; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1
+; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0
+; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax
+; AVXVNNI-AVX-NEXT: addl %edx, %eax
+; AVXVNNI-AVX-NEXT: vzeroupper
+; AVXVNNI-AVX-NEXT: retq
+;
+; AVXVNNI-AVX512-LABEL: mul_zext:
+; AVXVNNI-AVX512: # %bb.0: # %entry
+; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1
+; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0
+; AVXVNNI-AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax
+; AVXVNNI-AVX512-NEXT: addl %edx, %eax
+; AVXVNNI-AVX512-NEXT: vzeroupper
+; AVXVNNI-AVX512-NEXT: retq
;
; AVX512-LABEL: mul_zext:
; AVX512: # %bb.0: # %entry
@@ -153,25 +157,44 @@ entry:
}
define i32 @mul_sext(ptr%a, ptr%b, i32 %c, i32 %n) {
-; AVXVNNI-LABEL: mul_sext:
-; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1
-; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0
-; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVXVNNI-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVXVNNI-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vmovd %xmm0, %eax
-; AVXVNNI-NEXT: addl %edx, %eax
-; AVXVNNI-NEXT: vzeroupper
-; AVXVNNI-NEXT: retq
+; AVXVNNI-AVX-LABEL: mul_sext:
+; AVXVNNI-AVX: # %bb.0: # %entry
+; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1
+; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0
+; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax
+; AVXVNNI-AVX-NEXT: addl %edx, %eax
+; AVXVNNI-AVX-NEXT: vzeroupper
+; AVXVNNI-AVX-NEXT: retq
+;
+; AVXVNNI-AVX512-LABEL: mul_sext:
+; AVXVNNI-AVX512: # %bb.0: # %entry
+; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1
+; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0
+; AVXVNNI-AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax
+; AVXVNNI-AVX512-NEXT: addl %edx, %eax
+; AVXVNNI-AVX512-NEXT: vzeroupper
+; AVXVNNI-AVX512-NEXT: retq
;
; AVX512-LABEL: mul_sext:
; AVX512: # %bb.0: # %entry
@@ -312,17 +335,30 @@ entry:
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
-; AVXVNNI-LABEL: vpdpbusd_128:
-; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
-; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2
-; AVXVNNI-NEXT: vmovd %xmm2, %eax
-; AVXVNNI-NEXT: addl %edx, %eax
-; AVXVNNI-NEXT: retq
+; AVXVNNI-AVX-LABEL: vpdpbusd_128:
+; AVXVNNI-AVX: # %bb.0: # %entry
+; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVXVNNI-AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2
+; AVXVNNI-AVX-NEXT: vmovd %xmm2, %eax
+; AVXVNNI-AVX-NEXT: addl %edx, %eax
+; AVXVNNI-AVX-NEXT: retq
+;
+; AVXVNNI-AVX512-LABEL: vpdpbusd_128:
+; AVXVNNI-AVX512: # %bb.0: # %entry
+; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2
+; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax
+; AVXVNNI-AVX512-NEXT: addl %edx, %eax
+; AVXVNNI-AVX512-NEXT: retq
;
; AVX512VNNI-LABEL: vpdpbusd_128:
; AVX512VNNI: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index 456e6e8..bb47df5 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -1,20 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=ALL,AVXVNNI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VNNI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VLVNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI
define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) {
-; ALL-LABEL: mul_4xi8_zc_exceed:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0]
-; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; ALL-NEXT: vmovd %xmm0, %eax
-; ALL-NEXT: addl %edi, %eax
-; ALL-NEXT: retq
+; CHECK-LABEL: mul_4xi8_zc_exceed:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: retq
entry:
%0 = zext <4 x i8> %a to <4 x i32>
%1 = mul nsw <4 x i32> %0, <i32 0, i32 1, i32 2, i32 128>
@@ -24,14 +25,24 @@ entry:
}
define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
-; AVXVNNI-LABEL: mul_4xi8_zc:
-; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVXVNNI-NEXT: vmovd %xmm1, %eax
-; AVXVNNI-NEXT: addl %edi, %eax
-; AVXVNNI-NEXT: retq
+; AVXVNNI-AVX-LABEL: mul_4xi8_zc:
+; AVXVNNI-AVX: # %bb.0: # %entry
+; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax
+; AVXVNNI-AVX-NEXT: addl %edi, %eax
+; AVXVNNI-AVX-NEXT: retq
+;
+; AVXVNNI-AVX512-LABEL: mul_4xi8_zc:
+; AVXVNNI-AVX512: # %bb.0: # %entry
+; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax
+; AVXVNNI-AVX512-NEXT: addl %edi, %eax
+; AVXVNNI-AVX512-NEXT: retq
;
; AVX512VNNI-LABEL: mul_4xi8_zc:
; AVX512VNNI: # %bb.0: # %entry
@@ -62,16 +73,26 @@ entry:
}
define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
-; AVXVNNI-LABEL: mul_4xi4_cz:
-; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVXVNNI-NEXT: vmovd %xmm1, %eax
-; AVXVNNI-NEXT: addl %edi, %eax
-; AVXVNNI-NEXT: retq
+; AVXVNNI-AVX-LABEL: mul_4xi4_cz:
+; AVXVNNI-AVX: # %bb.0: # %entry
+; AVXVNNI-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVXVNNI-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax
+; AVXVNNI-AVX-NEXT: addl %edi, %eax
+; AVXVNNI-AVX-NEXT: retq
+;
+; AVXVNNI-AVX512-LABEL: mul_4xi4_cz:
+; AVXVNNI-AVX512: # %bb.0: # %entry
+; AVXVNNI-AVX512-NEXT: vpmovdb %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax
+; AVXVNNI-AVX512-NEXT: addl %edi, %eax
+; AVXVNNI-AVX512-NEXT: retq
;
; AVX512VNNI-LABEL: mul_4xi4_cz:
; AVX512VNNI: # %bb.0: # %entry
@@ -104,15 +125,26 @@ entry:
}
define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
-; AVXVNNI-LABEL: mul_4xi8_cs:
-; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1
-; AVXVNNI-NEXT: vmovd %xmm1, %eax
-; AVXVNNI-NEXT: addl %edi, %eax
-; AVXVNNI-NEXT: retq
+; AVXVNNI-AVX-LABEL: mul_4xi8_cs:
+; AVXVNNI-AVX: # %bb.0: # %entry
+; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVXVNNI-AVX-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1
+; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax
+; AVXVNNI-AVX-NEXT: addl %edi, %eax
+; AVXVNNI-AVX-NEXT: retq
+;
+; AVXVNNI-AVX512-LABEL: mul_4xi8_cs:
+; AVXVNNI-AVX512: # %bb.0: # %entry
+; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVXVNNI-AVX512-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm0, %xmm1, %xmm2
+; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax
+; AVXVNNI-AVX512-NEXT: addl %edi, %eax
+; AVXVNNI-AVX512-NEXT: retq
;
; AVX512VNNI-LABEL: mul_4xi8_cs:
; AVX512VNNI: # %bb.0: # %entry
@@ -145,17 +177,17 @@ entry:
}
define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) {
-; ALL-LABEL: mul_4xi8_cs_exceed:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: vpmovsxbd %xmm0, %xmm0
-; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0]
-; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; ALL-NEXT: vmovd %xmm0, %eax
-; ALL-NEXT: addl %edi, %eax
-; ALL-NEXT: retq
+; CHECK-LABEL: mul_4xi8_cs_exceed:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
+; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: addl %edi, %eax
+; CHECK-NEXT: retq
entry:
%0 = sext <4 x i8> %a to <4 x i32>
%1 = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 256>, %0
@@ -265,24 +297,44 @@ entry:
}
define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) {
-; AVXVNNI-LABEL: mul_64xi8_zc:
-; AVXVNNI: # %bb.0: # %entry
-; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64]
-; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4
-; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3
-; AVXVNNI-NEXT: vpaddd %ymm4, %ymm3, %ymm0
-; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVXVNNI-NEXT: vmovd %xmm0, %eax
-; AVXVNNI-NEXT: addl %edi, %eax
-; AVXVNNI-NEXT: vzeroupper
-; AVXVNNI-NEXT: retq
+; AVXVNNI-AVX-LABEL: mul_64xi8_zc:
+; AVXVNNI-AVX: # %bb.0: # %entry
+; AVXVNNI-AVX-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64]
+; AVXVNNI-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVXVNNI-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4
+; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3
+; AVXVNNI-AVX-NEXT: vpaddd %ymm4, %ymm3, %ymm0
+; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax
+; AVXVNNI-AVX-NEXT: addl %edi, %eax
+; AVXVNNI-AVX-NEXT: vzeroupper
+; AVXVNNI-AVX-NEXT: retq
+;
+; AVXVNNI-AVX512-LABEL: mul_64xi8_zc:
+; AVXVNNI-AVX512: # %bb.0: # %entry
+; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVXVNNI-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64]
+; AVXVNNI-AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVXVNNI-AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4
+; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3
+; AVXVNNI-AVX512-NEXT: vpaddd %ymm4, %ymm3, %ymm0
+; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax
+; AVXVNNI-AVX512-NEXT: addl %edi, %eax
+; AVXVNNI-AVX512-NEXT: vzeroupper
+; AVXVNNI-AVX512-NEXT: retq
;
; AVX512-LABEL: mul_64xi8_zc:
; AVX512: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index 3ed9858..9095fb1 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -243,7 +243,7 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 {
ret <4 x double> %r
}
-define float @trunc_signed_f32_no_fast_math(float %x) {
+define float @trunc_signed_f32_no_fast_math(float %x) nounwind {
; SSE-LABEL: trunc_signed_f32_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
@@ -259,14 +259,12 @@ define float @trunc_signed_f32_no_fast_math(float %x) {
; X86-AVX1-LABEL: trunc_signed_f32_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %eax
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
; X86-AVX1-NEXT: flds (%esp)
; X86-AVX1-NEXT: popl %eax
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: retl
%i = fptosi float %x to i32
%r = sitofp i32 %i to float
@@ -306,7 +304,7 @@ define float @trunc_signed_f32_nsz(float %x) #0 {
ret float %r
}
-define double @trunc_signed32_f64_no_fast_math(double %x) {
+define double @trunc_signed32_f64_no_fast_math(double %x) nounwind {
; SSE-LABEL: trunc_signed32_f64_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
@@ -322,10 +320,7 @@ define double @trunc_signed32_f64_no_fast_math(double %x) {
; X86-AVX1-LABEL: trunc_signed32_f64_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: .cfi_offset %ebp, -8
; X86-AVX1-NEXT: movl %esp, %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $8, %esp
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -335,7 +330,6 @@ define double @trunc_signed32_f64_no_fast_math(double %x) {
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
; X86-AVX1-NEXT: retl
%i = fptosi double %x to i32
%r = sitofp i32 %i to double
@@ -377,7 +371,7 @@ define double @trunc_signed32_f64_nsz(double %x) #0 {
ret double %r
}
-define double @trunc_f32_signed32_f64_no_fast_math(float %x) {
+define double @trunc_f32_signed32_f64_no_fast_math(float %x) nounwind {
; SSE-LABEL: trunc_f32_signed32_f64_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
@@ -393,10 +387,7 @@ define double @trunc_f32_signed32_f64_no_fast_math(float %x) {
; X86-AVX1-LABEL: trunc_f32_signed32_f64_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: .cfi_offset %ebp, -8
; X86-AVX1-NEXT: movl %esp, %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $8, %esp
; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -406,7 +397,6 @@ define double @trunc_f32_signed32_f64_no_fast_math(float %x) {
; X86-AVX1-NEXT: fldl (%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
; X86-AVX1-NEXT: retl
%i = fptosi float %x to i32
%r = sitofp i32 %i to double
@@ -445,7 +435,7 @@ define double @trunc_f32_signed32_f64_nsz(float %x) #0 {
ret double %r
}
-define float @trunc_f64_signed32_f32_no_fast_math(double %x) {
+define float @trunc_f64_signed32_f32_no_fast_math(double %x) nounwind {
; SSE-LABEL: trunc_f64_signed32_f32_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
@@ -461,14 +451,12 @@ define float @trunc_f64_signed32_f32_no_fast_math(double %x) {
; X86-AVX1-LABEL: trunc_f64_signed32_f32_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %eax
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0
; X86-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-AVX1-NEXT: vmovss %xmm0, (%esp)
; X86-AVX1-NEXT: flds (%esp)
; X86-AVX1-NEXT: popl %eax
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 4
; X86-AVX1-NEXT: retl
%i = fptosi double %x to i32
%r = sitofp i32 %i to float
@@ -503,7 +491,7 @@ define float @trunc_f64_signed32_f32_nsz(double %x) #0 {
ret float %r
}
-define double @trunc_signed_f64_no_fast_math(double %x) {
+define double @trunc_signed_f64_no_fast_math(double %x) nounwind {
; SSE-LABEL: trunc_signed_f64_no_fast_math:
; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
@@ -520,10 +508,7 @@ define double @trunc_signed_f64_no_fast_math(double %x) {
; X86-AVX1-LABEL: trunc_signed_f64_no_fast_math:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: pushl %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: .cfi_offset %ebp, -8
; X86-AVX1-NEXT: movl %esp, %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
; X86-AVX1-NEXT: andl $-8, %esp
; X86-AVX1-NEXT: subl $24, %esp
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -537,7 +522,6 @@ define double @trunc_signed_f64_no_fast_math(double %x) {
; X86-AVX1-NEXT: fldl {{[0-9]+}}(%esp)
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
; X86-AVX1-NEXT: retl
%i = fptosi double %x to i64
%r = sitofp i64 %i to double
diff --git a/llvm/test/CodeGen/X86/isint.ll b/llvm/test/CodeGen/X86/isint.ll
index 8a56f49..8c11fe1 100644
--- a/llvm/test/CodeGen/X86/isint.ll
+++ b/llvm/test/CodeGen/X86/isint.ll
@@ -1,29 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK64 %s
-; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK32 %s
+; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=X86 %s
; PR19059
define i32 @isint_return(double %d) nounwind {
-; CHECK64-LABEL: isint_return:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: cvttpd2dq %xmm0, %xmm1
-; CHECK64-NEXT: cvtdq2pd %xmm1, %xmm1
-; CHECK64-NEXT: cmpeqsd %xmm0, %xmm1
-; CHECK64-NEXT: movq %xmm1, %rax
-; CHECK64-NEXT: andl $1, %eax
-; CHECK64-NEXT: # kill: def $eax killed $eax killed $rax
-; CHECK64-NEXT: retq
+; X64-LABEL: isint_return:
+; X64: # %bb.0:
+; X64-NEXT: cvttpd2dq %xmm0, %xmm1
+; X64-NEXT: cvtdq2pd %xmm1, %xmm1
+; X64-NEXT: cmpeqsd %xmm0, %xmm1
+; X64-NEXT: movq %xmm1, %rax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
;
-; CHECK32-LABEL: isint_return:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK32-NEXT: cvttpd2dq %xmm0, %xmm1
-; CHECK32-NEXT: cvtdq2pd %xmm1, %xmm1
-; CHECK32-NEXT: cmpeqsd %xmm0, %xmm1
-; CHECK32-NEXT: movd %xmm1, %eax
-; CHECK32-NEXT: andl $1, %eax
-; CHECK32-NEXT: retl
+; X86-LABEL: isint_return:
+; X86: # %bb.0:
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: cvttpd2dq %xmm0, %xmm1
+; X86-NEXT: cvtdq2pd %xmm1, %xmm1
+; X86-NEXT: cmpeqsd %xmm0, %xmm1
+; X86-NEXT: movd %xmm1, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: retl
%i = fptosi double %d to i32
%e = sitofp i32 %i to double
%c = fcmp oeq double %d, %e
@@ -32,24 +32,24 @@ define i32 @isint_return(double %d) nounwind {
}
define i32 @isint_float_return(float %f) nounwind {
-; CHECK64-LABEL: isint_float_return:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: cvttps2dq %xmm0, %xmm1
-; CHECK64-NEXT: cvtdq2ps %xmm1, %xmm1
-; CHECK64-NEXT: cmpeqss %xmm0, %xmm1
-; CHECK64-NEXT: movd %xmm1, %eax
-; CHECK64-NEXT: andl $1, %eax
-; CHECK64-NEXT: retq
+; X64-LABEL: isint_float_return:
+; X64: # %bb.0:
+; X64-NEXT: cvttps2dq %xmm0, %xmm1
+; X64-NEXT: cvtdq2ps %xmm1, %xmm1
+; X64-NEXT: cmpeqss %xmm0, %xmm1
+; X64-NEXT: movd %xmm1, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
;
-; CHECK32-LABEL: isint_float_return:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK32-NEXT: cvttps2dq %xmm0, %xmm1
-; CHECK32-NEXT: cvtdq2ps %xmm1, %xmm1
-; CHECK32-NEXT: cmpeqss %xmm0, %xmm1
-; CHECK32-NEXT: movd %xmm1, %eax
-; CHECK32-NEXT: andl $1, %eax
-; CHECK32-NEXT: retl
+; X86-LABEL: isint_float_return:
+; X86: # %bb.0:
+; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: cvttps2dq %xmm0, %xmm1
+; X86-NEXT: cvtdq2ps %xmm1, %xmm1
+; X86-NEXT: cmpeqss %xmm0, %xmm1
+; X86-NEXT: movd %xmm1, %eax
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: retl
%i = fptosi float %f to i32
%g = sitofp i32 %i to float
%c = fcmp oeq float %f, %g
@@ -60,32 +60,32 @@ define i32 @isint_float_return(float %f) nounwind {
declare void @foo()
define void @isint_branch(double %d) nounwind {
-; CHECK64-LABEL: isint_branch:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: cvttpd2dq %xmm0, %xmm1
-; CHECK64-NEXT: cvtdq2pd %xmm1, %xmm1
-; CHECK64-NEXT: ucomisd %xmm1, %xmm0
-; CHECK64-NEXT: jne .LBB2_2
-; CHECK64-NEXT: jp .LBB2_2
-; CHECK64-NEXT: # %bb.1: # %true
-; CHECK64-NEXT: pushq %rax
-; CHECK64-NEXT: callq foo@PLT
-; CHECK64-NEXT: popq %rax
-; CHECK64-NEXT: .LBB2_2: # %false
-; CHECK64-NEXT: retq
+; X64-LABEL: isint_branch:
+; X64: # %bb.0:
+; X64-NEXT: cvttpd2dq %xmm0, %xmm1
+; X64-NEXT: cvtdq2pd %xmm1, %xmm1
+; X64-NEXT: ucomisd %xmm1, %xmm0
+; X64-NEXT: jne .LBB2_2
+; X64-NEXT: jp .LBB2_2
+; X64-NEXT: # %bb.1: # %true
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB2_2: # %false
+; X64-NEXT: retq
;
-; CHECK32-LABEL: isint_branch:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK32-NEXT: cvttpd2dq %xmm0, %xmm1
-; CHECK32-NEXT: cvtdq2pd %xmm1, %xmm1
-; CHECK32-NEXT: ucomisd %xmm1, %xmm0
-; CHECK32-NEXT: jne .LBB2_2
-; CHECK32-NEXT: jp .LBB2_2
-; CHECK32-NEXT: # %bb.1: # %true
-; CHECK32-NEXT: calll foo@PLT
-; CHECK32-NEXT: .LBB2_2: # %false
-; CHECK32-NEXT: retl
+; X86-LABEL: isint_branch:
+; X86: # %bb.0:
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: cvttpd2dq %xmm0, %xmm1
+; X86-NEXT: cvtdq2pd %xmm1, %xmm1
+; X86-NEXT: ucomisd %xmm1, %xmm0
+; X86-NEXT: jne .LBB2_2
+; X86-NEXT: jp .LBB2_2
+; X86-NEXT: # %bb.1: # %true
+; X86-NEXT: calll foo@PLT
+; X86-NEXT: .LBB2_2: # %false
+; X86-NEXT: retl
%i = fptosi double %d to i32
%e = sitofp i32 %i to double
%c = fcmp oeq double %d, %e
diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll
index 473fecc..57d557d 100644
--- a/llvm/test/CodeGen/X86/known-signbits-shl.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll
@@ -137,7 +137,7 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind {
; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT: por %xmm2, %xmm1
; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: paddw %xmm0, %xmm2
+; X64-NEXT: paddw %xmm2, %xmm2
; X64-NEXT: movdqa %xmm2, %xmm3
; X64-NEXT: psraw $1, %xmm3
; X64-NEXT: pcmpeqw %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/lea-16bit.ll b/llvm/test/CodeGen/X86/lea-16bit.ll
index cec29ab..40da01d 100644
--- a/llvm/test/CodeGen/X86/lea-16bit.ll
+++ b/llvm/test/CodeGen/X86/lea-16bit.ll
@@ -13,7 +13,8 @@ define i16 @lea16bit(i16 %in) {
; NDD-LABEL: lea16bit:
; NDD: # %bb.0:
; NDD-NEXT: # kill: def $edi killed $edi def $rdi
-; NDD-NEXT: leaw 1(%rdi,%rdi), %ax
+; NDD-NEXT: leal 1(%rdi,%rdi), %eax
+; NDD-NEXT: # kill: def $ax killed $ax killed $eax
; NDD-NEXT: retq
%shl = shl i16 %in, 1
%or = or i16 %shl, 1
diff --git a/llvm/test/CodeGen/X86/lea-8bit.ll b/llvm/test/CodeGen/X86/lea-8bit.ll
index 98222df..fc295f7 100644
--- a/llvm/test/CodeGen/X86/lea-8bit.ll
+++ b/llvm/test/CodeGen/X86/lea-8bit.ll
@@ -14,7 +14,8 @@ define i8 @lea8bit(i8 %in) {
; NDD-LABEL: lea8bit:
; NDD: # %bb.0:
; NDD-NEXT: # kill: def $edi killed $edi def $rdi
-; NDD-NEXT: leab 1(%rdi,%rdi), %al
+; NDD-NEXT: leal 1(%rdi,%rdi), %eax
+; NDD-NEXT: # kill: def $al killed $al killed $eax
; NDD-NEXT: retq
%shl = shl i8 %in, 1
%or = or i8 %shl, 1
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 4e6f666..4cde581 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4806,9 +4806,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
@@ -4830,9 +4829,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
@@ -4842,10 +4840,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
@@ -4875,9 +4872,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
@@ -4899,9 +4895,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
@@ -4911,10 +4906,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
@@ -4944,9 +4938,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
+; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0
+; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
; X64-KNL-NEXT: kmovw %k1, %k2
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4972,9 +4965,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
-; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2
+; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0
+; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2
; X64-SKX-SMALL-NEXT: kmovw %k1, %k2
; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
@@ -4986,10 +4978,9 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
-; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
+; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
-; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2
+; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm2
; X64-SKX-LARGE-NEXT: kmovw %k1, %k2
; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2}
diff --git a/llvm/test/CodeGen/X86/negative-sin.ll b/llvm/test/CodeGen/X86/negative-sin.ll
index f24507d..4836da2 100644
--- a/llvm/test/CodeGen/X86/negative-sin.ll
+++ b/llvm/test/CodeGen/X86/negative-sin.ll
@@ -82,18 +82,13 @@ define double @semi_strict2(double %e) nounwind {
ret double %h
}
-; FIXME:
-; Auto-upgrade function attribute to IR-level fast-math-flags.
-
-define double @fn_attr(double %e) nounwind #0 {
-; CHECK-LABEL: fn_attr:
+define double @nsz_flag(double %e) nounwind {
+; CHECK-LABEL: nsz_flag:
; CHECK: # %bb.0:
; CHECK-NEXT: jmp sin@PLT # TAILCALL
- %f = fsub double 0.0, %e
- %g = call double @sin(double %f) readonly
- %h = fsub double 0.0, %g
+ %f = fsub nsz double 0.0, %e
+ %g = call nsz double @sin(double %f) readonly
+ %h = fsub nsz double 0.0, %g
ret double %h
}
-attributes #0 = { "unsafe-fp-math"="true" "no-signed-zeros-fp-math"="true" }
-
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index f539830..5df1867 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -155,10 +155,10 @@ define <16 x i32> @PR42819(ptr %a0) {
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2: # %bb.0:
+; SSE2-NEXT: movl b(%rip), %eax
; SSE2-NEXT: movdqa c+144(%rip), %xmm2
; SSE2-NEXT: movdqa c+128(%rip), %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: addl b(%rip), %eax
+; SSE2-NEXT: addl c+128(%rip), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm3
@@ -166,7 +166,7 @@ define void @PR42833() {
; SSE2-NEXT: psubd %xmm2, %xmm4
; SSE2-NEXT: paddd %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: paddd %xmm0, %xmm5
+; SSE2-NEXT: paddd %xmm5, %xmm5
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT: movdqa %xmm2, c+144(%rip)
; SSE2-NEXT: movaps %xmm5, c+128(%rip)
@@ -191,17 +191,17 @@ define void @PR42833() {
;
; SSE42-LABEL: PR42833:
; SSE42: # %bb.0:
+; SSE42-NEXT: movl b(%rip), %eax
; SSE42-NEXT: movdqa c+144(%rip), %xmm1
; SSE42-NEXT: movdqa c+128(%rip), %xmm0
-; SSE42-NEXT: movd %xmm0, %eax
-; SSE42-NEXT: addl b(%rip), %eax
+; SSE42-NEXT: addl c+128(%rip), %eax
; SSE42-NEXT: movd %eax, %xmm2
; SSE42-NEXT: paddd %xmm0, %xmm2
; SSE42-NEXT: movdqa d+144(%rip), %xmm3
; SSE42-NEXT: psubd %xmm1, %xmm3
; SSE42-NEXT: paddd %xmm1, %xmm1
; SSE42-NEXT: movdqa %xmm0, %xmm4
-; SSE42-NEXT: paddd %xmm0, %xmm4
+; SSE42-NEXT: paddd %xmm4, %xmm4
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT: movdqa %xmm1, c+144(%rip)
; SSE42-NEXT: movdqa %xmm4, c+128(%rip)
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index ce03f8f..161e965 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -26,27 +26,33 @@ define i64 @PR62286(i32 %a) {
; AVX1-LABEL: PR62286:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR62286:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm1
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
@@ -59,12 +65,12 @@ define i64 @PR62286(i32 %a) {
; AVX512-LABEL: PR62286:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd %edi, %xmm0
-; AVX512-NEXT: movb $8, %al
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm1
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: movw $4369, %ax # imm = 0x1111
; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll
index ceccee0..5895526 100644
--- a/llvm/test/CodeGen/X86/pr74736.ll
+++ b/llvm/test/CodeGen/X86/pr74736.ll
@@ -6,8 +6,8 @@ define void @main(<16 x i32> %0, i32 %1) {
; SSE-LABEL: main:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movd %edi, %xmm4
-; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0]
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [0,1,0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: paddd %xmm3, %xmm3
@@ -32,20 +32,20 @@ define void @main(<16 x i32> %0, i32 %1) {
; AVX-LABEL: main:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
; AVX-NEXT: movl $1, %eax
; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2
-; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1
-; AVX-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7]
-; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2
+; AVX-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vpaddd %ymm2, %ymm2, %ymm2
+; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm3
; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
+; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,1,3,3,5,5,7]
+; AVX-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vpxor %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/setoeq.ll b/llvm/test/CodeGen/X86/setoeq.ll
index f0addf4..131e279 100644
--- a/llvm/test/CodeGen/X86/setoeq.ll
+++ b/llvm/test/CodeGen/X86/setoeq.ll
@@ -1,40 +1,532 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s
-
-define zeroext i8 @t(double %x) nounwind readnone {
-; CHECK-LABEL: t:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: cvttpd2dq %xmm0, %xmm1
-; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
-; CHECK-NEXT: cmpeqsd %xmm0, %xmm1
-; CHECK-NEXT: movd %xmm1, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retl
-entry:
- %0 = fptosi double %x to i32 ; <i32> [#uses=1]
- %1 = sitofp i32 %0 to double ; <double> [#uses=1]
- %2 = fcmp oeq double %1, %x ; <i1> [#uses=1]
- %retval12 = zext i1 %2 to i8 ; <i8> [#uses=1]
- ret i8 %retval12
-}
-
-define zeroext i8 @u(double %x) nounwind readnone {
-; CHECK-LABEL: u:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: cvttpd2dq %xmm0, %xmm1
-; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
-; CHECK-NEXT: cmpneqsd %xmm0, %xmm1
-; CHECK-NEXT: movd %xmm1, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retl
-entry:
- %0 = fptosi double %x to i32 ; <i32> [#uses=1]
- %1 = sitofp i32 %0 to double ; <double> [#uses=1]
- %2 = fcmp une double %1, %x ; <i1> [#uses=1]
- %retval12 = zext i1 %2 to i8 ; <i8> [#uses=1]
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+
+define zeroext i8 @oeq_f64_i32(double %x) nounwind readnone {
+; SSE-LABEL: oeq_f64_i32:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: cmpeqsd %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retl
+;
+; AVX-LABEL: oeq_f64_i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm1
+; AVX-NEXT: vcvtdq2pd %xmm1, %xmm1
+; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retl
+;
+; AVX512-LABEL: oeq_f64_i32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm1
+; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1
+; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retl
+entry:
+ %0 = fptosi double %x to i32
+ %1 = sitofp i32 %0 to double
+ %2 = fcmp oeq double %1, %x
+ %retval12 = zext i1 %2 to i8
+ ret i8 %retval12
+}
+
+define zeroext i8 @oeq_f64_u32(double %x) nounwind readnone {
+; SSE-LABEL: oeq_f64_u32:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: cvttsd2si %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: sarl $31, %ecx
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; SSE-NEXT: cvttsd2si %xmm1, %edx
+; SSE-NEXT: andl %ecx, %edx
+; SSE-NEXT: orl %eax, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; SSE-NEXT: cmpeqsd %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retl
+;
+; AVX-LABEL: oeq_f64_u32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vcvttsd2si %xmm0, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: sarl $31, %ecx
+; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; AVX-NEXT: vcvttsd2si %xmm1, %edx
+; AVX-NEXT: andl %ecx, %edx
+; AVX-NEXT: orl %eax, %edx
+; AVX-NEXT: vmovd %edx, %xmm1
+; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retl
+;
+; AVX512-LABEL: oeq_f64_u32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vcvttsd2usi %xmm0, %eax
+; AVX512-NEXT: vcvtusi2sd %eax, %xmm7, %xmm1
+; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retl
+entry:
+ %0 = fptoui double %x to i32
+ %1 = uitofp i32 %0 to double
+ %2 = fcmp oeq double %1, %x
+ %retval12 = zext i1 %2 to i8
+ ret i8 %retval12
+}
+
+define zeroext i8 @oeq_f64_i64(double %x) nounwind readnone {
+; SSE-LABEL: oeq_f64_i64:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: movl %esp, %ebp
+; SSE-NEXT: andl $-8, %esp
+; SSE-NEXT: subl $32, %esp
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: fldl {{[0-9]+}}(%esp)
+; SSE-NEXT: fnstcw {{[0-9]+}}(%esp)
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: orl $3072, %eax # imm = 0xC00
+; SSE-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; SSE-NEXT: fldcw {{[0-9]+}}(%esp)
+; SSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; SSE-NEXT: fldcw {{[0-9]+}}(%esp)
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movlps %xmm1, {{[0-9]+}}(%esp)
+; SSE-NEXT: fildll {{[0-9]+}}(%esp)
+; SSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; SSE-NEXT: cmpeqsd {{[0-9]+}}(%esp), %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: movl %ebp, %esp
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl
+;
+; AVX-LABEL: oeq_f64_i64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushl %ebp
+; AVX-NEXT: movl %esp, %ebp
+; AVX-NEXT: andl $-8, %esp
+; AVX-NEXT: subl $24, %esp
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd %xmm0, (%esp)
+; AVX-NEXT: fldl (%esp)
+; AVX-NEXT: fisttpll (%esp)
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX-NEXT: vcmpeqsd {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: movl %ebp, %esp
+; AVX-NEXT: popl %ebp
+; AVX-NEXT: retl
+;
+; AVX512-LABEL: oeq_f64_i64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1
+; AVX512-NEXT: vcvtqq2pd %ymm1, %ymm1
+; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retl
+entry:
+ %0 = fptosi double %x to i64
+ %1 = sitofp i64 %0 to double
+ %2 = fcmp oeq double %1, %x
+ %retval12 = zext i1 %2 to i8
+ ret i8 %retval12
+}
+
+define zeroext i8 @oeq_f64_u64(double %x) nounwind readnone {
+; SSE-LABEL: oeq_f64_u64:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: movl %esp, %ebp
+; SSE-NEXT: andl $-8, %esp
+; SSE-NEXT: subl $16, %esp
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
+; SSE-NEXT: ucomisd %xmm0, %xmm1
+; SSE-NEXT: jbe .LBB3_2
+; SSE-NEXT: # %bb.1: # %entry
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: .LBB3_2: # %entry
+; SSE-NEXT: movapd %xmm0, %xmm2
+; SSE-NEXT: subsd %xmm1, %xmm2
+; SSE-NEXT: movsd %xmm2, {{[0-9]+}}(%esp)
+; SSE-NEXT: setbe %al
+; SSE-NEXT: fldl {{[0-9]+}}(%esp)
+; SSE-NEXT: fnstcw {{[0-9]+}}(%esp)
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT: orl $3072, %ecx # imm = 0xC00
+; SSE-NEXT: movw %cx, {{[0-9]+}}(%esp)
+; SSE-NEXT: fldcw {{[0-9]+}}(%esp)
+; SSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; SSE-NEXT: fldcw {{[0-9]+}}(%esp)
+; SSE-NEXT: movzbl %al, %eax
+; SSE-NEXT: shll $31, %eax
+; SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT: addsd %xmm2, %xmm1
+; SSE-NEXT: cmpeqsd %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: movl %ebp, %esp
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl
+;
+; AVX-LABEL: oeq_f64_u64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushl %ebp
+; AVX-NEXT: movl %esp, %ebp
+; AVX-NEXT: andl $-8, %esp
+; AVX-NEXT: subl $8, %esp
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
+; AVX-NEXT: vucomisd %xmm0, %xmm1
+; AVX-NEXT: jbe .LBB3_2
+; AVX-NEXT: # %bb.1: # %entry
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: .LBB3_2: # %entry
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vmovsd %xmm1, (%esp)
+; AVX-NEXT: fldl (%esp)
+; AVX-NEXT: fisttpll (%esp)
+; AVX-NEXT: setbe %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: shll $31, %eax
+; AVX-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: movl %ebp, %esp
+; AVX-NEXT: popl %ebp
+; AVX-NEXT: retl
+;
+; AVX512-LABEL: oeq_f64_u64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1
+; AVX512-NEXT: vcvtuqq2pd %ymm1, %ymm1
+; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retl
+entry:
+ %0 = fptoui double %x to i64
+ %1 = uitofp i64 %0 to double
+ %2 = fcmp oeq double %1, %x
+ %retval12 = zext i1 %2 to i8
+ ret i8 %retval12
+}
+
+define zeroext i8 @une_f64_i32(double %x) nounwind readnone {
+; SSE-LABEL: une_f64_i32:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm1
+; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE-NEXT: cmpneqsd %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retl
+;
+; AVX-LABEL: une_f64_i32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm1
+; AVX-NEXT: vcvtdq2pd %xmm1, %xmm1
+; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retl
+;
+; AVX512-LABEL: une_f64_i32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm1
+; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1
+; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retl
+entry:
+ %0 = fptosi double %x to i32
+ %1 = sitofp i32 %0 to double
+ %2 = fcmp une double %1, %x
+ %retval12 = zext i1 %2 to i8
+ ret i8 %retval12
+}
+
+define zeroext i8 @une_f64_u32(double %x) nounwind readnone {
+; SSE-LABEL: une_f64_u32:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: cvttsd2si %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: sarl $31, %ecx
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; SSE-NEXT: cvttsd2si %xmm1, %edx
+; SSE-NEXT: andl %ecx, %edx
+; SSE-NEXT: orl %eax, %edx
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; SSE-NEXT: cmpneqsd %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retl
+;
+; AVX-LABEL: une_f64_u32:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vcvttsd2si %xmm0, %eax
+; AVX-NEXT: movl %eax, %ecx
+; AVX-NEXT: sarl $31, %ecx
+; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; AVX-NEXT: vcvttsd2si %xmm1, %edx
+; AVX-NEXT: andl %ecx, %edx
+; AVX-NEXT: orl %eax, %edx
+; AVX-NEXT: vmovd %edx, %xmm1
+; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retl
+;
+; AVX512-LABEL: une_f64_u32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vcvttsd2usi %xmm0, %eax
+; AVX512-NEXT: vcvtusi2sd %eax, %xmm7, %xmm1
+; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retl
+entry:
+ %0 = fptoui double %x to i32
+ %1 = uitofp i32 %0 to double
+ %2 = fcmp une double %1, %x
+ %retval12 = zext i1 %2 to i8
+ ret i8 %retval12
+}
+
+define zeroext i8 @une_f64_i64(double %x) nounwind readnone {
+; SSE-LABEL: une_f64_i64:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: movl %esp, %ebp
+; SSE-NEXT: andl $-8, %esp
+; SSE-NEXT: subl $32, %esp
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
+; SSE-NEXT: fldl {{[0-9]+}}(%esp)
+; SSE-NEXT: fnstcw {{[0-9]+}}(%esp)
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: orl $3072, %eax # imm = 0xC00
+; SSE-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; SSE-NEXT: fldcw {{[0-9]+}}(%esp)
+; SSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; SSE-NEXT: fldcw {{[0-9]+}}(%esp)
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movlps %xmm1, {{[0-9]+}}(%esp)
+; SSE-NEXT: fildll {{[0-9]+}}(%esp)
+; SSE-NEXT: fstpl {{[0-9]+}}(%esp)
+; SSE-NEXT: cmpneqsd {{[0-9]+}}(%esp), %xmm0
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: movl %ebp, %esp
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl
+;
+; AVX-LABEL: une_f64_i64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushl %ebp
+; AVX-NEXT: movl %esp, %ebp
+; AVX-NEXT: andl $-8, %esp
+; AVX-NEXT: subl $24, %esp
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd %xmm0, (%esp)
+; AVX-NEXT: fldl (%esp)
+; AVX-NEXT: fisttpll (%esp)
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX-NEXT: vcmpneqsd {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: movl %ebp, %esp
+; AVX-NEXT: popl %ebp
+; AVX-NEXT: retl
+;
+; AVX512-LABEL: une_f64_i64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1
+; AVX512-NEXT: vcvtqq2pd %ymm1, %ymm1
+; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retl
+entry:
+ %0 = fptosi double %x to i64
+ %1 = sitofp i64 %0 to double
+ %2 = fcmp une double %1, %x
+ %retval12 = zext i1 %2 to i8
+ ret i8 %retval12
+}
+
+define zeroext i8 @une_f64_u64(double %x) nounwind readnone {
+; SSE-LABEL: une_f64_u64:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushl %ebp
+; SSE-NEXT: movl %esp, %ebp
+; SSE-NEXT: andl $-8, %esp
+; SSE-NEXT: subl $16, %esp
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
+; SSE-NEXT: ucomisd %xmm0, %xmm1
+; SSE-NEXT: jbe .LBB7_2
+; SSE-NEXT: # %bb.1: # %entry
+; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE-NEXT: .LBB7_2: # %entry
+; SSE-NEXT: movapd %xmm0, %xmm2
+; SSE-NEXT: subsd %xmm1, %xmm2
+; SSE-NEXT: movsd %xmm2, {{[0-9]+}}(%esp)
+; SSE-NEXT: setbe %al
+; SSE-NEXT: fldl {{[0-9]+}}(%esp)
+; SSE-NEXT: fnstcw {{[0-9]+}}(%esp)
+; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; SSE-NEXT: orl $3072, %ecx # imm = 0xC00
+; SSE-NEXT: movw %cx, {{[0-9]+}}(%esp)
+; SSE-NEXT: fldcw {{[0-9]+}}(%esp)
+; SSE-NEXT: fistpll {{[0-9]+}}(%esp)
+; SSE-NEXT: fldcw {{[0-9]+}}(%esp)
+; SSE-NEXT: movzbl %al, %eax
+; SSE-NEXT: shll $31, %eax
+; SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT: addsd %xmm2, %xmm1
+; SSE-NEXT: cmpneqsd %xmm0, %xmm1
+; SSE-NEXT: movd %xmm1, %eax
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: movl %ebp, %esp
+; SSE-NEXT: popl %ebp
+; SSE-NEXT: retl
+;
+; AVX-LABEL: une_f64_u64:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushl %ebp
+; AVX-NEXT: movl %esp, %ebp
+; AVX-NEXT: andl $-8, %esp
+; AVX-NEXT: subl $8, %esp
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
+; AVX-NEXT: vucomisd %xmm0, %xmm1
+; AVX-NEXT: jbe .LBB7_2
+; AVX-NEXT: # %bb.1: # %entry
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: .LBB7_2: # %entry
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vmovsd %xmm1, (%esp)
+; AVX-NEXT: fldl (%esp)
+; AVX-NEXT: fisttpll (%esp)
+; AVX-NEXT: setbe %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: shll $31, %eax
+; AVX-NEXT: xorl {{[0-9]+}}(%esp), %eax
+; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: movl %ebp, %esp
+; AVX-NEXT: popl %ebp
+; AVX-NEXT: retl
+;
+; AVX512-LABEL: une_f64_u64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1
+; AVX512-NEXT: vcvtuqq2pd %ymm1, %ymm1
+; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retl
+entry:
+ %0 = fptoui double %x to i64
+ %1 = uitofp i64 %0 to double
+ %2 = fcmp une double %1, %x
+ %retval12 = zext i1 %2 to i8
ret i8 %retval12
}
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index 756019d..03b61d9 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -10,7 +10,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm3
+; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm3
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpsrlq $63, %xmm4, %xmm4
; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm2
@@ -34,7 +34,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) {
; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3
-; AVX512VBMI-NEXT: vpsllq $1, %xmm0, %xmm4
+; AVX512VBMI-NEXT: vpaddq %xmm0, %xmm0, %xmm4
; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
@@ -51,7 +51,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) {
; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2
; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm4
+; ZNVER4-NEXT: vpaddq %xmm0, %xmm0, %xmm4
; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3
; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 3f48b22..a48be03 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -5791,20 +5791,20 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_slli_epi16:
; SSE: # %bb.0:
-; SSE-NEXT: psllw $1, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x01]
+; SSE-NEXT: psllw $2, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x02]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_slli_epi16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x01]
+; AVX1-NEXT: vpsllw $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x02]
; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_slli_epi16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x01]
+; AVX512-NEXT: vpsllw $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x02]
; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
- %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1)
+ %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 2)
%bc = bitcast <8 x i16> %res to <2 x i64>
ret <2 x i64> %bc
}
@@ -5813,20 +5813,20 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_slli_epi32:
; SSE: # %bb.0:
-; SSE-NEXT: pslld $1, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x01]
+; SSE-NEXT: pslld $2, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x02]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_slli_epi32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x01]
+; AVX1-NEXT: vpslld $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x02]
; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_slli_epi32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x01]
+; AVX512-NEXT: vpslld $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x02]
; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
- %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1)
+ %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 2)
%bc = bitcast <4 x i32> %res to <2 x i64>
ret <2 x i64> %bc
}
@@ -5835,19 +5835,19 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_slli_epi64:
; SSE: # %bb.0:
-; SSE-NEXT: psllq $1, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x01]
+; SSE-NEXT: psllq $2, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x02]
; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_slli_epi64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x73,0xf0,0x01]
+; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x73,0xf0,0x02]
; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_slli_epi64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllq $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x01]
+; AVX512-NEXT: vpsllq $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x02]
; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1)
+ %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 2)
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll
index 71e659c..219e32c 100644
--- a/llvm/test/CodeGen/X86/vec_shift6.ll
+++ b/llvm/test/CodeGen/X86/vec_shift6.ll
@@ -28,14 +28,14 @@ define <8 x i16> @test2(<8 x i16> %a) {
; SSE2-LABEL: test2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test2:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: paddw %xmm0, %xmm1
+; SSE41-NEXT: paddw %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
@@ -56,7 +56,7 @@ define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm1
; SSE2-NEXT: pslld $2, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
@@ -81,14 +81,14 @@ define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test4:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: paddd %xmm0, %xmm1
+; SSE41-NEXT: paddd %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll b/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll
index 23d22e7..3f92d2b 100644
--- a/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll
+++ b/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math.
@@ -18,7 +18,7 @@ define <4 x float> @vec_fneg(<4 x float> %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: retq
- %sub = fsub <4 x float> zeroinitializer, %x
+ %sub = fsub nsz <4 x float> zeroinitializer, %x
ret <4 x float> %sub
}
diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll
index 5c48559..b4cffcd 100644
--- a/llvm/test/CodeGen/X86/vector-gep.ll
+++ b/llvm/test/CodeGen/X86/vector-gep.ll
@@ -122,91 +122,87 @@ define <64 x ptr> @AGEP9(ptr %param, <64 x i32> %off) nounwind {
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-32, %esp
; CHECK-NEXT: subl $160, %esp
-; CHECK-NEXT: vmovdqa %ymm2, %ymm5
-; CHECK-NEXT: vmovdqa %ymm1, %ymm3
-; CHECK-NEXT: vmovdqa %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa 72(%ebp), %ymm0
-; CHECK-NEXT: vmovdqa 40(%ebp), %ymm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm4
-; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm7
-; CHECK-NEXT: vpaddd %xmm4, %xmm7, %xmm4
-; CHECK-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2
-; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2
-; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3
+; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5
+; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3
+; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqa 104(%ebp), %ymm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2
-; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqa 136(%ebp), %ymm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2
-; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovdqa 168(%ebp), %ymm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2
-; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2
-; CHECK-NEXT: vmovdqa %xmm2, (%esp) # 16-byte Spill
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovdqa 40(%ebp), %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm2
-; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm7, %xmm1
-; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm6
-; CHECK-NEXT: vpaddd %xmm6, %xmm7, %xmm6
-; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3
-; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpaddd %xmm3, %xmm7, %xmm3
-; CHECK-NEXT: vmovdqa %ymm5, %ymm4
-; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm5
-; CHECK-NEXT: vpaddd %xmm5, %xmm7, %xmm5
-; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm4
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovdqa 56(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill
+; CHECK-NEXT: vmovdqa 88(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2
+; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1
+; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0
+; CHECK-NEXT: vmovdqa 136(%ebp), %xmm6
+; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6
+; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6
+; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7
+; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7
+; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7
+; CHECK-NEXT: vmovdqa 168(%ebp), %xmm4
; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpaddd %xmm4, %xmm7, %xmm4
+; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4
+; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3
+; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3
; CHECK-NEXT: movl 8(%ebp), %eax
-; CHECK-NEXT: vmovdqa %xmm4, 80(%eax)
-; CHECK-NEXT: vmovdqa %xmm5, 64(%eax)
-; CHECK-NEXT: vmovdqa %xmm3, 48(%eax)
-; CHECK-NEXT: vmovdqa %xmm6, 32(%eax)
-; CHECK-NEXT: vmovdqa %xmm1, 16(%eax)
-; CHECK-NEXT: vmovdqa %xmm0, (%eax)
-; CHECK-NEXT: vmovdqa %xmm2, 240(%eax)
+; CHECK-NEXT: vmovdqa %xmm3, 240(%eax)
+; CHECK-NEXT: vmovdqa %xmm4, 224(%eax)
+; CHECK-NEXT: vmovdqa %xmm7, 208(%eax)
+; CHECK-NEXT: vmovdqa %xmm6, 192(%eax)
+; CHECK-NEXT: vmovdqa %xmm0, 176(%eax)
+; CHECK-NEXT: vmovdqa %xmm1, 160(%eax)
+; CHECK-NEXT: vmovdqa %xmm2, 144(%eax)
; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 224(%eax)
+; CHECK-NEXT: vmovaps %xmm0, 128(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 208(%eax)
+; CHECK-NEXT: vmovaps %xmm0, 112(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 192(%eax)
+; CHECK-NEXT: vmovaps %xmm0, 96(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 176(%eax)
+; CHECK-NEXT: vmovaps %xmm0, 80(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 160(%eax)
+; CHECK-NEXT: vmovaps %xmm0, 64(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 144(%eax)
+; CHECK-NEXT: vmovaps %xmm0, 48(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 128(%eax)
+; CHECK-NEXT: vmovaps %xmm0, 32(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 112(%eax)
+; CHECK-NEXT: vmovaps %xmm0, 16(%eax)
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovaps %xmm0, 96(%eax)
+; CHECK-NEXT: vmovaps %xmm0, (%eax)
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 13f7d68..33d80f6 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -652,7 +652,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: paddb %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psllw $1, %xmm2
+; SSE2-NEXT: paddw %xmm2, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: psrlw $2, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -678,7 +678,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddb %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psllw $1, %xmm2
+; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: psrlw $2, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -701,7 +701,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -720,7 +720,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2NOBW-NEXT: vpsllw $1, %xmm1, %xmm2
+; AVX2NOBW-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -739,7 +739,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpsllw $1, %xmm1, %xmm2
+; AVX512BW-NEXT: vpaddw %xmm1, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 1a5c373..e43108f 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -590,7 +590,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsllw $1, %xmm3, %xmm5
+; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm5
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3
@@ -609,7 +609,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2
@@ -633,7 +633,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX2NOBW-NEXT: vpsllw $1, %ymm1, %ymm2
+; AVX2NOBW-NEXT: vpaddw %ymm1, %ymm1, %ymm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
@@ -651,7 +651,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vpsllw $1, %ymm1, %ymm2
+; AVX512BW-NEXT: vpaddw %ymm1, %ymm1, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index 9c56894..bf98bcc 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -485,7 +485,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsllw $1, %ymm3, %ymm5
+; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm5
; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
@@ -504,7 +504,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $1, %ymm2, %ymm3
+; AVX512F-NEXT: vpaddw %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2
@@ -528,7 +528,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpsllw $1, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm2
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 13b21a7..6e1bf25 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -821,10 +821,10 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; X86-SSE-NEXT: andl $-16, %esp
; X86-SSE-NEXT: subl $16, %esp
; X86-SSE-NEXT: movdqa %xmm1, %xmm3
-; X86-SSE-NEXT: paddw %xmm1, %xmm3
+; X86-SSE-NEXT: paddw %xmm3, %xmm3
; X86-SSE-NEXT: paddw %xmm3, %xmm1
; X86-SSE-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE-NEXT: paddw %xmm0, %xmm3
+; X86-SSE-NEXT: paddw %xmm3, %xmm3
; X86-SSE-NEXT: paddw %xmm2, %xmm0
; X86-SSE-NEXT: paddw %xmm3, %xmm0
; X86-SSE-NEXT: paddw 8(%ebp), %xmm1
@@ -835,9 +835,9 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; X64-SSE-LABEL: madd_v16i16_3:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa %xmm1, %xmm4
-; X64-SSE-NEXT: paddw %xmm1, %xmm4
+; X64-SSE-NEXT: paddw %xmm4, %xmm4
; X64-SSE-NEXT: movdqa %xmm0, %xmm5
-; X64-SSE-NEXT: paddw %xmm0, %xmm5
+; X64-SSE-NEXT: paddw %xmm5, %xmm5
; X64-SSE-NEXT: paddw %xmm2, %xmm0
; X64-SSE-NEXT: paddw %xmm5, %xmm0
; X64-SSE-NEXT: paddw %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 227e000..ab1feba 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -907,7 +907,7 @@ define i1 @mask_v8i32_2(<8 x i32> %a0) {
; SSE2-LABEL: mask_v8i32_2:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pslld $1, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: sete %al
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 2b1cf5b..99dac74 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -927,7 +927,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: paddq %xmm0, %xmm1
+; SSE2-NEXT: paddq %xmm1, %xmm1
; SSE2-NEXT: psllq $7, %xmm0
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
@@ -975,7 +975,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v2i64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
-; X86-SSE-NEXT: paddq %xmm0, %xmm1
+; X86-SSE-NEXT: paddq %xmm1, %xmm1
; X86-SSE-NEXT: psllq $7, %xmm0
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index bec3349..3590c4d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -62,15 +62,12 @@ define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1)
define <4 x float> @freeze_insertps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: freeze_insertps:
; SSE: # %bb.0:
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm1 = xmm0[1],xmm1[1,2,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: freeze_insertps:
; AVX: # %bb.0:
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],xmm1[1,2,3]
+; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%s0 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 16)
%f0 = freeze <4 x float> %s0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 5b61de5..ee9d8a5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3550,14 +3550,14 @@ define <8 x i16> @PR141475(i32 %in) {
; SSE-LABEL: PR141475:
; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
-; SSE-NEXT: pslld $1, %xmm0
+; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: PR141475:
; AVX: # %bb.0:
; AVX-NEXT: vmovd %edi, %xmm0
-; AVX-NEXT: vpslld $1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT: retq
%mul = shl i32 %in, 1
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
index 54dc107..3b93734 100644
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -1438,26 +1438,26 @@ define <8 x i16> @test_128_i16_x_8_65024_mask_ashr_10(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_127_mask_shl_1(<8 x i16> %a0) {
; X86-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: paddw %xmm0, %xmm0
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: paddw %xmm0, %xmm0
+; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
%t0 = and <8 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
%t1 = shl <8 x i16> %t0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -1656,26 +1656,26 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_6(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_65024_mask_shl_1(<8 x i16> %a0) {
; X86-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: paddw %xmm0, %xmm0
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: paddw %xmm0, %xmm0
+; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
%t0 = and <8 x i16> %a0, <i16 65024, i16 65024, i16 65024, i16 65024, i16 65024, i16 65024, i16 65024, i16 65024>
%t1 = shl <8 x i16> %t0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -2373,40 +2373,40 @@ define <4 x i32> @test_128_i32_x_4_4294836224_mask_ashr_18(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_32767_mask_shl_1(<4 x i32> %a0) {
; X86-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: paddd %xmm0, %xmm0
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1:
; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: paddd %xmm0, %xmm0
+; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
%t0 = and <4 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767>
%t1 = shl <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1>
@@ -2675,40 +2675,40 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_10(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_4294836224_mask_shl_1(<4 x i32> %a0) {
; X86-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: paddd %xmm0, %xmm0
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1:
; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224]
-; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152]
+; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: paddd %xmm0, %xmm0
+; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224]
-; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152]
+; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
%t0 = and <4 x i32> %a0, <i32 4294836224, i32 4294836224, i32 4294836224, i32 4294836224>
%t1 = shl <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1>
@@ -3325,26 +3325,26 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> %
define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: paddq %xmm0, %xmm0
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: paddq %xmm0, %xmm0
+; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
%t0 = and <2 x i64> %a0, <i64 2147483647, i64 2147483647>
%t1 = shl <2 x i64> %t0, <i64 1, i64 1>
@@ -3543,26 +3543,26 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_18(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_shl_1(<2 x i64> %a0) {
; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: paddq %xmm0, %xmm0
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT: retl
;
; X86-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: paddq %xmm0, %xmm0
+; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: retq
%t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024>
%t1 = shl <2 x i64> %t0, <i64 1, i64 1>
diff --git a/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll b/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll
new file mode 100644
index 0000000..559f201
--- /dev/null
+++ b/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll
@@ -0,0 +1,43 @@
+; RUN: llc --filetype=obj -O0 -o - %s | llvm-dwarfdump --verify -
+
+; Check that the abstract DIE for a subprogram referenced from another compile
+; unit is emitted in the correct CU.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+define void @a() !dbg !10 {
+ br label %for.b.c.c, !dbg !13
+ for.b.c.c:
+ br label %for.b.c.c
+}
+
+!llvm.dbg.cu = !{!0, !6}
+!llvm.module.flags = !{!8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_20, file: !1, emissionKind: FullDebug, globals: !2)
+!1 = !DIFile(filename: "foo.cpp", directory: "")
+!2 = !{!3}
+!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression())
+!4 = !DIGlobalVariable(type: !5)
+!5 = !DICompositeType(tag: DW_TAG_class_type)
+!6 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_20, file: !7, emissionKind: FullDebug)
+!7 = !DIFile(filename: "bar.cpp", directory: "")
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = distinct !DISubprogram(type: !11, unit: !6)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(scope: !14, inlinedAt: !15)
+!14 = distinct !DISubprogram(unit: !6)
+!15 = !DILocation(scope: !16, inlinedAt: !25)
+!16 = distinct !DISubprogram(type: !11, unit: !6, declaration: !17)
+!17 = !DISubprogram(scope: !5, type: !11, spFlags: DISPFlagOptimized, templateParams: !18)
+!18 = !{!19}
+!19 = !DITemplateTypeParameter(type: !20)
+!20 = !DICompositeType(tag: DW_TAG_class_type, scope: !21)
+!21 = distinct !DISubprogram(unit: !6, retainedNodes: !22)
+!22 = !{!23}
+!23 = !DILocalVariable(scope: !21, type: !24)
+!24 = !DIBasicType()
+!25 = !DILocation(scope: !21, inlinedAt: !26)
+!26 = !DILocation(scope: !10)
diff --git a/llvm/test/DebugInfo/AArch64/debug-types.ll b/llvm/test/DebugInfo/AArch64/debug-types.ll
new file mode 100644
index 0000000..0d0fd33
--- /dev/null
+++ b/llvm/test/DebugInfo/AArch64/debug-types.ll
@@ -0,0 +1,59 @@
+; Check that composite type DIEs go to the debug_types section.
+
+; RUN: llc -generate-type-units -filetype=obj %s -o - | llvm-dwarfdump -debug-info -debug-types - | FileCheck %s
+
+; CHECK: .debug_info contents:
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_class_type
+; CHECK: DW_AT_signature ([[SIG_A:0x[0-9a-f]+]])
+; CHECK: DW_TAG_subprogram
+; CHECK: NULL
+; CHECK: DW_TAG_subprogram
+; CHECK: "_ZN1A6AppendEv"
+; CHECK: DW_TAG_class_type
+; CHECK: DW_AT_signature ([[SIG_LAMBDA:0x[0-9a-f]+]])
+; CHECK: DW_TAG_variable
+; CHECK: NULL
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK: NULL
+; CHECK: NULL
+
+; CHECK: .debug_types contents:
+; CHECK: Type Unit: {{.*}} type_signature = [[SIG_A]]
+; CHECK: DW_TAG_class_type
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name ("A")
+; CHECK: Type Unit: {{.*}} type_signature = [[SIG_LAMBDA]]
+; CHECK: DW_TAG_class_type
+; CHECK: DW_TAG_class_type
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_decl_line (7)
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @_Z1f1A() !dbg !4 {
+entry:
+ ret void, !dbg !8
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, emissionKind: FullDebug, globals: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "f", linkageName: "_Z1f1A", scope: !5, file: !5, line: 14, type: !6, scopeLine: 14, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!5 = !DIFile(filename: "repro.ii", directory: "")
+!6 = distinct !DISubroutineType(types: !7)
+!7 = !{null}
+!8 = !DILocation(line: 8, column: 12, scope: !9, inlinedAt: !16)
+!9 = distinct !DISubprogram(name: "Append", linkageName: "_ZN1A6AppendEv", scope: !10, file: !5, line: 6, type: !11, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !12, retainedNodes: !13)
+!10 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !5, line: 3, size: 32, flags: DIFlagTypePassByValue, elements: !2, identifier: "_ZTS1A")
+!11 = distinct !DISubroutineType(types: !7)
+!12 = !DISubprogram(name: "Append", linkageName: "_ZN1A6AppendEv", scope: !10, file: !5, line: 6, type: !11, scopeLine: 6, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized)
+!13 = !{!14}
+!14 = !DILocalVariable(name: "raw_append", scope: !9, file: !5, line: 7, type: !15)
+!15 = distinct !DICompositeType(tag: DW_TAG_class_type, scope: !9, file: !5, line: 7, size: 8, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !2, identifier: "_ZTSZN1A6AppendEvEUlvE_")
+!16 = distinct !DILocation(line: 14, column: 15, scope: !4)
diff --git a/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll b/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll
new file mode 100644
index 0000000..20cc98a
--- /dev/null
+++ b/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll
@@ -0,0 +1,67 @@
+; Check that abstract DIEs for inlined subprograms and lexical scopes
+; are populated only once.
+
+; RUN: llc -filetype=obj %s -o - | llvm-dwarfdump - -o - | FileCheck --implicit-check-not=DW_TAG_lexical_scope --implicit-check-not DW_TAG_subprogram %s
+
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_namespace
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_declaration (true)
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_declaration (true)
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_declaration (true)
+; CHECK: NULL
+
+; CHECK: [[ABSTRACT_SP:0x[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK: DW_AT_inline (DW_INL_inlined)
+
+; CHECK: DW_TAG_lexical_block
+; CHECK: DW_TAG_imported_module
+; CHECK: NULL
+
+; CHECK: NULL
+
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK: DW_AT_abstract_origin ([[ABSTRACT_SP]]
+; CHECK: NULL
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK: DW_AT_abstract_origin ([[ABSTRACT_SP]]
+; CHECK: NULL
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE() !dbg !4 {
+entry:
+ ret void, !dbg !8
+}
+
+define void @_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE() !dbg !15 {
+entry:
+ ret void, !dbg !17
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "CodeGenPGO.cpp", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "TraverseIfStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE", scope: !5, file: !1, line: 364, type: !6, scopeLine: 364, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !7, retainedNodes: !2, keyInstructions: true)
+!5 = !DINamespace(name: "llvm", scope: null)
+!6 = distinct !DISubroutineType(types: !2)
+!7 = !DISubprogram(name: "TraverseIfStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE", scope: !5, file: !1, line: 364, type: !6, scopeLine: 364, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized)
+!8 = !DILocation(line: 982, column: 39, scope: !9, inlinedAt: !14, atomGroup: 6, atomRank: 2)
+!9 = distinct !DISubprogram(name: "combine", linkageName: "_ZN12_GLOBAL__N_17PGOHash7combineENS0_8HashTypeE", scope: !5, file: !1, line: 966, type: !6, scopeLine: 966, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !10, retainedNodes: !11, keyInstructions: true)
+!10 = !DISubprogram(name: "combine", linkageName: "_ZN12_GLOBAL__N_17PGOHash7combineENS0_8HashTypeE", scope: !5, file: !1, line: 140, type: !6, scopeLine: 140, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized)
+!11 = !{!12}
+!12 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !13, entity: !5, file: !1, line: 973)
+!13 = distinct !DILexicalBlock(scope: !9, file: !1, line: 972, column: 7)
+!14 = distinct !DILocation(line: 393, column: 10, scope: !4)
+!15 = distinct !DISubprogram(name: "VisitStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE", scope: !5, file: !1, line: 355, type: !6, scopeLine: 355, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !16, retainedNodes: !2, keyInstructions: true)
+!16 = !DISubprogram(name: "VisitStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE", scope: !5, file: !1, line: 355, type: !6, scopeLine: 355, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized)
+!17 = !DILocation(line: 982, column: 13, scope: !9, inlinedAt: !18)
+!18 = distinct !DILocation(line: 360, column: 12, scope: !15)
diff --git a/llvm/test/DebugInfo/Generic/inlined-static-var.ll b/llvm/test/DebugInfo/Generic/inlined-static-var.ll
new file mode 100644
index 0000000..1d24646
--- /dev/null
+++ b/llvm/test/DebugInfo/Generic/inlined-static-var.ll
@@ -0,0 +1,93 @@
+; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck --implicit-check-not "{{DW_TAG|NULL}}" %s
+
+; inline __attribute__((always_inline))
+; int removed() { static int A; return A++; }
+;
+; __attribute__((always_inline))
+; int not_removed() { static int B; return B++; }
+;
+; int foo() { return removed() + not_removed(); }
+
+; Ensure that global variables belong to the correct subprograms even if those
+; subprograms are inlined.
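+; For now B is expected on the concrete out-of-line copy of not_removed
+; rather than on its abstract DIE (see the TODO below), while A sits on
+; the lone DIE for removed, whose out-of-line definition was dropped.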
+
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_abstract_origin {{.*}} "_Z11not_removedv"
+; TODO: This variable should be emitted in the abstract subprogram DIE.
+; CHECK: DW_TAG_variable
+; CHECK: DW_AT_name ("B")
+; CHECK: NULL
+; CHECK: DW_TAG_base_type
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_name ("removed")
+; CHECK: DW_TAG_variable
+; CHECK: DW_AT_name ("A")
+; CHECK: NULL
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_name ("not_removed")
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_name ("foo")
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK: DW_TAG_inlined_subroutine
+; CHECK: NULL
+; CHECK: NULL
+
+@_ZZ11not_removedvE1A = internal global i32 0, align 4, !dbg !0
+@_ZZ7removedvE1A = linkonce_odr dso_local global i32 0, align 4, !dbg !10
+
+define dso_local i32 @_Z11not_removedv() !dbg !2 {
+ %1 = load i32, i32* @_ZZ11not_removedvE1A, align 4, !dbg !24
+ %2 = add nsw i32 %1, 1, !dbg !24
+ store i32 %2, i32* @_ZZ11not_removedvE1A, align 4, !dbg !24
+ ret i32 %1, !dbg !25
+}
+
+define dso_local i32 @_Z3foov() !dbg !26 {
+ %1 = load i32, i32* @_ZZ7removedvE1A, align 4, !dbg !27
+ %2 = add nsw i32 %1, 1, !dbg !27
+ store i32 %2, i32* @_ZZ7removedvE1A, align 4, !dbg !27
+ %3 = load i32, i32* @_ZZ11not_removedvE1A, align 4, !dbg !29
+ %4 = add nsw i32 %3, 1, !dbg !29
+ store i32 %4, i32* @_ZZ11not_removedvE1A, align 4, !dbg !29
+ %5 = add nsw i32 %1, %3, !dbg !31
+ ret i32 %5, !dbg !32
+}
+
+!llvm.dbg.cu = !{!7}
+!llvm.module.flags = !{!14, !15, !16, !17, !18, !19, !20, !21, !22}
+!llvm.ident = !{!23}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "B", scope: !2, file: !3, line: 5, type: !6, isLocal: true, isDefinition: true)
+!2 = distinct !DISubprogram(name: "not_removed", linkageName: "_Z11not_removedv", scope: !3, file: !3, line: 5, type: !4, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13)
+!3 = !DIFile(filename: "example.cpp", directory: "")
+!4 = !DISubroutineType(types: !5)
+!5 = !{!6}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !8, producer: "clang version 14.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !9, splitDebugInlining: false, nameTableKind: None)
+!8 = !DIFile(filename: "example.cpp", directory: "")
+!9 = !{!0, !10}
+!10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression())
+!11 = distinct !DIGlobalVariable(name: "A", scope: !12, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true)
+!12 = distinct !DISubprogram(name: "removed", linkageName: "_Z7removedv", scope: !3, file: !3, line: 2, type: !4, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13)
+!13 = !{}
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{i32 1, !"branch-target-enforcement", i32 0}
+!18 = !{i32 1, !"sign-return-address", i32 0}
+!19 = !{i32 1, !"sign-return-address-all", i32 0}
+!20 = !{i32 1, !"sign-return-address-with-bkey", i32 0}
+!21 = !{i32 7, !"uwtable", i32 1}
+!22 = !{i32 7, !"frame-pointer", i32 1}
+!23 = !{!"clang version 14.0.0"}
+!24 = !DILocation(line: 5, column: 43, scope: !2)
+!25 = !DILocation(line: 5, column: 35, scope: !2)
+!26 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !3, file: !3, line: 7, type: !4, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13)
+!27 = !DILocation(line: 2, column: 39, scope: !12, inlinedAt: !28)
+!28 = distinct !DILocation(line: 7, column: 20, scope: !26)
+!29 = !DILocation(line: 5, column: 43, scope: !2, inlinedAt: !30)
+!30 = distinct !DILocation(line: 7, column: 32, scope: !26)
+!31 = !DILocation(line: 7, column: 30, scope: !26)
+!32 = !DILocation(line: 7, column: 13, scope: !26)
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt
index d2ec213..7064479 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt
@@ -1,55 +1,56 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
-# GFX1250: s_mov_b64 s[2:3], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_mov_b64 s[2:3], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_add_nc_u64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_add_nc_u64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_and_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_and_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), lit64(0x10abcdef12345678) ; encoding: [0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), lit64(0x10abcdef12345678) ; encoding: [0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_ashr_i64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_ashr_i64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_bfe_i64 s[2:3], lit64(0x80abcdef12345678), 5 ; encoding: [0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80]
0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80
+# GFX1250: s_bfe_i64 s[2:3], lit64(0x80abcdef12345678), 5 ; encoding: [0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80]
-# GFX1250: s_bfe_u64 s[2:3], lit64(0x10abcdef12345678), 5 ; encoding: [0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_bfe_u64 s[2:3], lit64(0x10abcdef12345678), 5 ; encoding: [0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_cselect_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_cselect_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_lshl_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_lshl_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_lshr_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_lshr_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_mul_u64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_mul_u64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_nand_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_nand_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_nor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_nor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_or_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_or_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_or_not1_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_or_not1_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_xnor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_xnor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: s_xor_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: s_xor_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt
index 963e693..227e1c4 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt
@@ -1,34 +1,35 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
-# GFX1250: s_add_pc_i64 lit64(0x12345678abcd0) ; encoding: [0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00]
0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00
+# GFX1250: s_add_pc_i64 lit64(0x12345678abcd0) ; encoding: [0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00]
-# GFX1250: s_add_pc_i64 0x64 ; encoding: [0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00]
0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00
+# GFX1250: s_add_pc_i64 0x64 ; encoding: [0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00]
-# GFX1250: s_add_pc_i64 4 ; encoding: [0x84,0x4b,0x80,0xbe]
0x84,0x4b,0x80,0xbe
+# GFX1250: s_add_pc_i64 4 ; encoding: [0x84,0x4b,0x80,0xbe]
-# GFX1250: s_add_pc_i64 s[2:3] ; encoding: [0x02,0x4b,0x80,0xbe]
0x02,0x4b,0x80,0xbe
+# GFX1250: s_add_pc_i64 s[2:3] ; encoding: [0x02,0x4b,0x80,0xbe]
-# GFX1250: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4c,0x82,0xbe]
0x88,0x4c,0x82,0xbe
+# GFX1250: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4c,0x82,0xbe]
-# GFX1250: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4d,0x82,0xbe]
0x88,0x4d,0x82,0xbe
+# GFX1250: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4d,0x82,0xbe]
-# GFX1250: s_get_shader_cycles_u64 s[2:3] ; encoding: [0x00,0x06,0x82,0xbe]
0x00,0x06,0x82,0xbe
+# GFX1250: s_get_shader_cycles_u64 s[2:3] ; encoding: [0x00,0x06,0x82,0xbe]
-# GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe]
0xc3,0x4e,0x80,0xbe
+# GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe]
-# GFX1250: s_get_barrier_state s3, -3 ; encoding: [0xc3,0x50,0x83,0xbe]
0xc3,0x50,0x83,0xbe
+# GFX1250: s_get_barrier_state s3, -3 ; encoding: [0xc3,0x50,0x83,0xbe]
-# GFX1250: s_get_barrier_state s3, -4 ; encoding: [0xc4,0x50,0x83,0xbe]
0xc4,0x50,0x83,0xbe
+# GFX1250: s_get_barrier_state s3, -4 ; encoding: [0xc4,0x50,0x83,0xbe]
-# GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe]
0x7d,0x50,0x83,0xbe
+# GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt
index 30650b4..1571fb9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt
@@ -1,232 +1,233 @@
+# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
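+# In the encodings below, src operand byte 0xfe appears to select a
+# 64-bit literal (eight trailing immediate bytes) and 0xff the 32-bit
+# literal form (four bytes); single-byte operands such as 0xf4 (2.0) or
+# 0x80 (0) are inline constants. Small integers still disassemble as
+# lit64() when encoded in the 64-bit form.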
-# GFX1250: v_add_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_add_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_class_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_class_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_eq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_eq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_ge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_ge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_gt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_gt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_gt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_gt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_gt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_gt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_le_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_le_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_le_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_le_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_le_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_le_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_lg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_lg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_lt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_lt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_lt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_lt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_lt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_lt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_ne_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_ne_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_ne_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_ne_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_neq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_neq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_nge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_nge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_ngt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_ngt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_nle_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_nle_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_nlg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_nlg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_nlt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_nlt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_o_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_o_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmp_u_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmp_u_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_class_f64_e32 lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_class_f64_e32 lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_eq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_eq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_eq_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_eq_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_eq_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_eq_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_ge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_ge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_ge_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_ge_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_ge_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_ge_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_gt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_gt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_gt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_gt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_gt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_gt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_le_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_le_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_le_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_le_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_le_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_le_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_lg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_lg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_lt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_lt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_lt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_lt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_lt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_lt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_ne_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_ne_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_ne_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_ne_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_neq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_neq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_nge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_nge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_ngt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_ngt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_nle_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_nle_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_nlg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_nlg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_nlt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_nlt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_o_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_o_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cmpx_u_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cmpx_u_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cvt_f32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cvt_f32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cvt_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cvt_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_cvt_u32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_cvt_u32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_floor_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_floor_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_fract_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_fract_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_frexp_exp_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_frexp_exp_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_frexp_mant_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_frexp_mant_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_max_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_max_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_min_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_min_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_mul_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_mul_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_rcp_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_rcp_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_rndne_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_rndne_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_rsq_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_rsq_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_sqrt_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_sqrt_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_trunc_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10
+# GFX1250: v_trunc_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10]
-# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x4063233333333333) ; encoding: [0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40]
0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40
+# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x4063233333333333) ; encoding: [0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40]
-# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x448969368974c05b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44]
0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44
+# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x448969368974c05b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44]
-# GFX1250: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40]
0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40
+# GFX1250: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40]
-# GFX1250: v_mov_b64_e32 v[0:1], 0x12345678 ; encoding: [0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12]
0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12
+# GFX1250: v_mov_b64_e32 v[0:1], 0x12345678 ; encoding: [0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12]
-# GFX1250: v_ceil_f64_e32 v[254:255], 0.15915494309189532 ; encoding: [0xf8,0x30,0xfc,0x7f]
0xf8,0x30,0xfc,0x7f
+# GFX1250: v_ceil_f64_e32 v[254:255], 0.15915494309189532 ; encoding: [0xf8,0x30,0xfc,0x7f]
-# GFX1250: v_ceil_f64_e32 v[254:255], -4.0 ; encoding: [0xf7,0x30,0xfc,0x7f]
0xf7,0x30,0xfc,0x7f
+# GFX1250: v_ceil_f64_e32 v[254:255], -4.0 ; encoding: [0xf7,0x30,0xfc,0x7f]
-# GFX1250: v_ceil_f64_e32 v[254:255], 2.0 ; encoding: [0xf4,0x30,0xfc,0x7f]
0xf4,0x30,0xfc,0x7f
+# GFX1250: v_ceil_f64_e32 v[254:255], 2.0 ; encoding: [0xf4,0x30,0xfc,0x7f]
-# GFX1250: v_ceil_f64_e32 v[254:255], 0 ; encoding: [0x80,0x30,0xfc,0x7f]
0x80,0x30,0xfc,0x7f
+# GFX1250: v_ceil_f64_e32 v[254:255], 0 ; encoding: [0x80,0x30,0xfc,0x7f]
-# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x7b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x7b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x109a) ; encoding: [0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00
+# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x109a) ; encoding: [0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
diff --git a/llvm/test/MachineVerifier/test_g_build_vector.mir b/llvm/test/MachineVerifier/test_g_build_vector.mir
index 50b9801..9857306 100644
--- a/llvm/test/MachineVerifier/test_g_build_vector.mir
+++ b/llvm/test/MachineVerifier/test_g_build_vector.mir
@@ -16,17 +16,17 @@ body: |
; CHECK: Bad machine code: G_BUILD_VECTOR must produce a vector from scalar operands
%3:_(<2 x s32>) = G_BUILD_VECTOR %2
- ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement
+ ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element
%4:_(<2 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0
; CHECK: Bad machine code: G_BUILD_VECTOR result element type must match source type
- ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement
+ ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element
%5:_(<4 x s16>) = G_BUILD_VECTOR %0, %0
%6:_(s16) = IMPLICIT_DEF
; CHECK: Bad machine code: G_BUILD_VECTOR result element type must match source type
- ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement
+ ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element
%7:_(<2 x s32>) = G_BUILD_VECTOR %6, %6, %6, %6
%8:_(p0) = IMPLICIT_DEF
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/track-predecessor-ranges.ll b/llvm/test/Transforms/CorrelatedValuePropagation/track-predecessor-ranges.ll
new file mode 100644
index 0000000..b5f68842
--- /dev/null
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/track-predecessor-ranges.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes="correlated-propagation" -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes="correlated-propagation" -lvi-per-pred-ranges -S 2>&1 | FileCheck %s -check-prefix=LVI-PRED-RANGES
+
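+; A sketch of the per-predecessor derivation this test relies on: from
+; %bb, %arg is in [0,1025), so (%arg + 7) >> 3 is at most 128; from
+; %bb2, %arg is in [1025,262145), so (%arg + 15487) >> 7 is at most
+; 2168. Both stay below 4338, so with -lvi-per-pred-ranges the bounds
+; check %icmp6 folds to true, while the default merged range cannot
+; prove it (compare the two check prefixes below).
+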
+@global = external local_unnamed_addr global [4338 x i32], align 16
+
+define dso_local noundef zeroext i1 @bar(i64 noundef %arg, ptr noundef writeonly captures(none) %arg1) local_unnamed_addr {
+; CHECK-LABEL: define dso_local noundef zeroext i1 @bar(
+; CHECK-SAME: i64 noundef [[ARG:%.*]], ptr noundef writeonly captures(none) [[ARG1:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[ARG]], 1025
+; CHECK-NEXT: br i1 [[ICMP]], label %[[BB4:.*]], label %[[BB2:.*]]
+; CHECK: [[BB2]]:
+; CHECK-NEXT: [[ICMP3:%.*]] = icmp ult i64 [[ARG]], 262145
+; CHECK-NEXT: br i1 [[ICMP3]], label %[[BB4]], label %[[BB9:.*]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ 7, %[[BB]] ], [ 15487, %[[BB2]] ]
+; CHECK-NEXT: [[PHI5:%.*]] = phi i64 [ 3, %[[BB]] ], [ 7, %[[BB2]] ]
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[PHI]], [[ARG]]
+; CHECK-NEXT: [[LSHR:%.*]] = lshr i64 [[ADD]], [[PHI5]]
+; CHECK-NEXT: [[ICMP6:%.*]] = icmp samesign ult i64 [[LSHR]], 4338
+; CHECK-NEXT: br i1 [[ICMP6]], label %[[BB8:.*]], label %[[BB7:.*]]
+; CHECK: [[BB7]]:
+; CHECK-NEXT: tail call void @llvm.ubsantrap(i8 18)
+; CHECK-NEXT: unreachable
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw [4338 x i32], ptr @global, i64 0, i64 [[LSHR]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GETELEMENTPTR]], align 4
+; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[LOAD]] to i64
+; CHECK-NEXT: store i64 [[SEXT]], ptr [[ARG1]], align 8
+; CHECK-NEXT: br label %[[BB9]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[PHI10:%.*]] = phi i1 [ true, %[[BB8]] ], [ false, %[[BB2]] ]
+; CHECK-NEXT: ret i1 [[PHI10]]
+;
+; LVI-PRED-RANGES-LABEL: define dso_local noundef zeroext i1 @bar(
+; LVI-PRED-RANGES-SAME: i64 noundef [[ARG:%.*]], ptr noundef writeonly captures(none) [[ARG1:%.*]]) local_unnamed_addr {
+; LVI-PRED-RANGES-NEXT: [[BB:.*]]:
+; LVI-PRED-RANGES-NEXT: [[ICMP:%.*]] = icmp ult i64 [[ARG]], 1025
+; LVI-PRED-RANGES-NEXT: br i1 [[ICMP]], label %[[BB4:.*]], label %[[BB2:.*]]
+; LVI-PRED-RANGES: [[BB2]]:
+; LVI-PRED-RANGES-NEXT: [[ICMP3:%.*]] = icmp ult i64 [[ARG]], 262145
+; LVI-PRED-RANGES-NEXT: br i1 [[ICMP3]], label %[[BB4]], label %[[BB9:.*]]
+; LVI-PRED-RANGES: [[BB4]]:
+; LVI-PRED-RANGES-NEXT: [[PHI:%.*]] = phi i64 [ 7, %[[BB]] ], [ 15487, %[[BB2]] ]
+; LVI-PRED-RANGES-NEXT: [[PHI5:%.*]] = phi i64 [ 3, %[[BB]] ], [ 7, %[[BB2]] ]
+; LVI-PRED-RANGES-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[PHI]], [[ARG]]
+; LVI-PRED-RANGES-NEXT: [[LSHR:%.*]] = lshr i64 [[ADD]], [[PHI5]]
+; LVI-PRED-RANGES-NEXT: br i1 true, label %[[BB8:.*]], label %[[BB7:.*]]
+; LVI-PRED-RANGES: [[BB7]]:
+; LVI-PRED-RANGES-NEXT: tail call void @llvm.ubsantrap(i8 18)
+; LVI-PRED-RANGES-NEXT: unreachable
+; LVI-PRED-RANGES: [[BB8]]:
+; LVI-PRED-RANGES-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw [4338 x i32], ptr @global, i64 0, i64 [[LSHR]]
+; LVI-PRED-RANGES-NEXT: [[LOAD:%.*]] = load i32, ptr [[GETELEMENTPTR]], align 4
+; LVI-PRED-RANGES-NEXT: [[SEXT:%.*]] = sext i32 [[LOAD]] to i64
+; LVI-PRED-RANGES-NEXT: store i64 [[SEXT]], ptr [[ARG1]], align 8
+; LVI-PRED-RANGES-NEXT: br label %[[BB9]]
+; LVI-PRED-RANGES: [[BB9]]:
+; LVI-PRED-RANGES-NEXT: [[PHI10:%.*]] = phi i1 [ true, %[[BB8]] ], [ false, %[[BB2]] ]
+; LVI-PRED-RANGES-NEXT: ret i1 [[PHI10]]
+;
+bb:
+ %icmp = icmp ult i64 %arg, 1025
+ br i1 %icmp, label %bb4, label %bb2
+
+bb2: ; preds = %bb
+ %icmp3 = icmp ult i64 %arg, 262145
+ br i1 %icmp3, label %bb4, label %bb9
+
+bb4: ; preds = %bb2, %bb
+ %phi = phi i64 [ 7, %bb ], [ 15487, %bb2 ]
+ %phi5 = phi i64 [ 3, %bb ], [ 7, %bb2 ]
+ %add = add nuw nsw i64 %phi, %arg
+ %lshr = lshr i64 %add, %phi5
+ %icmp6 = icmp samesign ult i64 %lshr, 4338
+ br i1 %icmp6, label %bb8, label %bb7
+
+bb7: ; preds = %bb4
+ tail call void @llvm.ubsantrap(i8 18)
+ unreachable
+
+bb8: ; preds = %bb4
+ %getelementptr = getelementptr inbounds nuw [4338 x i32], ptr @global, i64 0, i64 %lshr
+ %load = load i32, ptr %getelementptr, align 4
+ %sext = sext i32 %load to i64
+ store i64 %sext, ptr %arg1, align 8
+ br label %bb9
+
+bb9: ; preds = %bb8, %bb2
+ %phi10 = phi i1 [ true, %bb8 ], [ false, %bb2 ]
+ ret i1 %phi10
+}
+
+; Function Attrs: cold noreturn nounwind
+declare void @llvm.ubsantrap(i8 immarg) #0
+
+attributes #0 = { cold noreturn nounwind }
diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
index e2a9b4e..8a6f60b 100644
--- a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
+++ b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
@@ -1,6 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -passes=drop-unnecessary-assumes < %s | FileCheck %s
+declare void @use(i32 %x)
+declare i32 @get()
+
define void @basic_dead(i32 %x) {
; CHECK-LABEL: define void @basic_dead(
; CHECK-SAME: i32 [[X:%.*]]) {
@@ -180,3 +183,136 @@ define void @type_test(ptr %x) {
call void @llvm.assume(i1 %test)
ret void
}
+
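+; In the two tests below every condition and bundle operand feeds only
+; its assume, so the assumes and their now-dead conditions should be
+; dropped outright.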
+define void @multiple_dead_conds(i32 %x) {
+; CHECK-LABEL: define void @multiple_dead_conds(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ %cond1 = icmp sge i32 %x, 0
+ call void @llvm.assume(i1 %cond1)
+ %cond2 = icmp ne i32 %x, 64
+ call void @llvm.assume(i1 %cond2)
+ ret void
+}
+
+define void @multiple_dead_bundles(ptr %x) {
+; CHECK-LABEL: define void @multiple_dead_bundles(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ call void @llvm.assume(i1 true) ["align"(ptr %x, i64 8), "nonnull"(ptr %x)]
+ ret void
+}
+
+; The assume is eliminated, but currently leaves behind a dead cycle.
+define void @dead_cycle(i1 %loop.cond) {
+; CHECK-LABEL: define void @dead_cycle(
+; CHECK-SAME: i1 [[LOOP_COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %cond = icmp ne i32 %iv, 64
+ call void @llvm.assume(i1 %cond)
+ %iv.next = add i32 %iv, 1
+ br i1 %loop.cond, label %loop, label %exit
+
+exit:
+ ret void
+}
+
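+; In the next two tests the assume must stay: %x, or a value computed
+; from it, also reaches the side-effecting call to @use.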
+define void @use_in_side_effect(i32 %x) {
+; CHECK-LABEL: define void @use_in_side_effect(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: call void @use(i32 [[X]])
+; CHECK-NEXT: ret void
+;
+ %cond = icmp sge i32 %x, 0
+ call void @llvm.assume(i1 %cond)
+ call void @use(i32 %x)
+ ret void
+}
+
+define void @indirect_use_in_side_effect(i32 %x) {
+; CHECK-LABEL: define void @indirect_use_in_side_effect(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X]], 1
+; CHECK-NEXT: call void @use(i32 [[ADD]])
+; CHECK-NEXT: ret void
+;
+ %cond = icmp sge i32 %x, 0
+ call void @llvm.assume(i1 %cond)
+ %add = add i32 %x, 1
+ call void @use(i32 %add)
+ ret void
+}
+
+; The affected value itself has a side effect, but we can still drop the
+; assume.
+define void @affected_value_has_side_effect() {
+; CHECK-LABEL: define void @affected_value_has_side_effect() {
+; CHECK-NEXT: [[X:%.*]] = call i32 @get()
+; CHECK-NEXT: ret void
+;
+ %x = call i32 @get()
+ %cond = icmp sge i32 %x, 0
+ call void @llvm.assume(i1 %cond)
+ ret void
+}
+
+define i32 @affected_value_has_side_effect_and_is_used() {
+; CHECK-LABEL: define i32 @affected_value_has_side_effect_and_is_used() {
+; CHECK-NEXT: [[X:%.*]] = call i32 @get()
+; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0
+; CHECK-NEXT: call void @llvm.assume(i1 [[COND]])
+; CHECK-NEXT: ret i32 [[X]]
+;
+ %x = call i32 @get()
+ %cond = icmp sge i32 %x, 0
+ call void @llvm.assume(i1 %cond)
+ ret i32 %x
+}
+
+@g = external global i8
+@g2 = external global i8
+
+; Assumes on globals are currently not supported.
+define void @assume_on_global() {
+; CHECK-LABEL: define void @assume_on_global() {
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @g, i64 8) ]
+; CHECK-NEXT: ret void
+;
+ call void @llvm.assume(i1 true) ["align"(ptr @g, i64 8)]
+ ret void
+}
+
+define void @assume_on_global_used_in_other_func() {
+; CHECK-LABEL: define void @assume_on_global_used_in_other_func() {
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @g2, i64 8) ]
+; CHECK-NEXT: ret void
+;
+ call void @llvm.assume(i1 true) ["align"(ptr @g2, i64 8)]
+ ret void
+}
+
+define ptr @other_func() {
+; CHECK-LABEL: define ptr @other_func() {
+; CHECK-NEXT: ret ptr @g2
+;
+ ret ptr @g2
+}
diff --git a/llvm/test/Transforms/InstCombine/preserve-profile.ll b/llvm/test/Transforms/InstCombine/preserve-profile.ll
index dd83805..8cb3e68 100644
--- a/llvm/test/Transforms/InstCombine/preserve-profile.ll
+++ b/llvm/test/Transforms/InstCombine/preserve-profile.ll
@@ -46,9 +46,59 @@ define i32 @NegBin(i1 %C) !prof !0 {
ret i32 %V
}
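+
+; For the selects synthesized below in functions that carry profile
+; data, InstCombine has no real branch weights to derive, so the tests
+; expect !prof !{!"unknown", !"instcombine"} on them (matched as
+; [[PROF2]] at the end of the file).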
+define i32 @select_C_minus_1_or_C_from_bool(i1 %x) !prof !0 {
+; CHECK-LABEL: define i32 @select_C_minus_1_or_C_from_bool(
+; CHECK-SAME: i1 [[X:%.*]]) !prof [[PROF0]] {
+; CHECK-NEXT: [[ADD:%.*]] = select i1 [[X]], i32 41, i32 42, !prof [[PROF2:![0-9]+]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %ext = sext i1 %x to i32
+ %add = add i32 %ext, 42
+ ret i32 %add
+}
+
+define i5 @and_add(i1 %x, i1 %y) !prof !0 {
+; CHECK-LABEL: define i5 @and_add(
+; CHECK-SAME: i1 [[X:%.*]], i1 [[Y:%.*]]) !prof [[PROF0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[X]], true
+; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[Y]], [[TMP1]]
+; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP2]], i5 -2, i5 0, !prof [[PROF2]]
+; CHECK-NEXT: ret i5 [[R]]
+;
+ %xz = zext i1 %x to i5
+ %ys = sext i1 %y to i5
+ %sub = add i5 %xz, %ys
+ %r = and i5 %sub, 30
+ ret i5 %r
+}
+
+define i32 @add_zext_zext_i1(i1 %a) !prof !0 {
+; CHECK-LABEL: define i32 @add_zext_zext_i1(
+; CHECK-SAME: i1 [[A:%.*]]) !prof [[PROF0]] {
+; CHECK-NEXT: [[ADD:%.*]] = select i1 [[A]], i32 2, i32 0, !prof [[PROF2]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %zext = zext i1 %a to i32
+ %add = add i32 %zext, %zext
+ ret i32 %add
+}
+
+define i32 @no_count_no_branch_weights(i1 %a) {
+; CHECK-LABEL: define i32 @no_count_no_branch_weights(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT: [[ADD:%.*]] = select i1 [[A]], i32 2, i32 0
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %zext = zext i1 %a to i32
+ %add = add i32 %zext, %zext
+ ret i32 %add
+}
+
!0 = !{!"function_entry_count", i64 1000}
!1 = !{!"branch_weights", i32 2, i32 3}
;.
; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3}
+; CHECK: [[PROF2]] = !{!"unknown", !"instcombine"}
;.
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
new file mode 100644
index 0000000..61b1331
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+target datalayout = "p1:64:64:64:32"
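+; Here addrspace(1) pointers are 64 bits wide with a 32-bit address
+; (index) width, so ptrtoaddr in that space yields i32, while the
+; default address space keeps 64-bit addresses. The ptrtoaddr(inttoptr)
+; pairs below should only fold when the integer width matches the
+; address width.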
+
+define i32 @ptrtoaddr_inttoptr_arg(i32 %a) {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg(
+; CHECK-SAME: i32 [[A:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A]] to i64
+; CHECK-NEXT: [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1)
+; CHECK-NEXT: [[TOADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[TOPTR]] to i32
+; CHECK-NEXT: ret i32 [[TOADDR]]
+;
+ %toptr = inttoptr i32 %a to ptr addrspace(1)
+ %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
+ ret i32 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() {
+; CHECK-NEXT: ret i32 -1
+;
+ ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i32 -1 to ptr addrspace(1)) to i32)
+}
+
+define i32 @ptrtoaddr_inttoptr_diff_size1() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() {
+; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32)
+;
+ ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32)
+}
+
+define i32 @ptrtoaddr_inttoptr_diff_size2() {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() {
+; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32)
+;
+ ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32)
+}
+
+define i64 @ptrtoaddr_inttoptr_noas1() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() {
+; CHECK-NEXT: ret i64 1
+;
+ ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 1) to i64)
+}
+
+define i64 @ptrtoaddr_inttoptr_noas2() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas2() {
+; CHECK-NEXT: ret i64 123
+;
+ ret i64 ptrtoaddr (ptr inttoptr (i64 123 to ptr) to i64)
+}
+
+define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() {
+; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64)
+;
+ ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64)
+}
+
+define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() {
+; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64)
+;
+ ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64)
+}
diff --git a/llvm/test/Transforms/InstCombine/vector-reductions.ll b/llvm/test/Transforms/InstCombine/vector-reductions.ll
index 10f4aca..f1e0dd9 100644
--- a/llvm/test/Transforms/InstCombine/vector-reductions.ll
+++ b/llvm/test/Transforms/InstCombine/vector-reductions.ll
@@ -308,3 +308,174 @@ define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) {
%r = sub i32 %r0, %r1
ret i32 %r
}
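+
+; reduce.add over an n-lane splat of %x sums n copies of %x, so it
+; should fold to mul %x, n (a shl when n is a power of two). The tests
+; below also cover a splat taken from lane 1, a poison splat, and
+; narrow element types where the sum wraps.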
+
+define i32 @constant_multiplied_4xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_4xi32(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 2
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+ ret i32 %4
+}
+
+define i32 @constant_multiplied_3xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_3xi32(
+; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP0:%.*]], 3
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <3 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <3 x i32> %2, <3 x i32> poison, <3 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %3)
+ ret i32 %4
+}
+
+define i64 @constant_multiplied_4xi64(i64 %0) {
+; CHECK-LABEL: @constant_multiplied_4xi64(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0:%.*]], 2
+; CHECK-NEXT: ret i64 [[TMP2]]
+;
+ %2 = insertelement <4 x i64> poison, i64 %0, i64 0
+ %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <4 x i32> zeroinitializer
+ %4 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
+ ret i64 %4
+}
+
+define i32 @constant_multiplied_8xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_8xi32(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 3
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <8 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
+ ret i32 %4
+}
+
+define i32 @constant_multiplied_16xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_16xi32(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 4
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
+ ret i32 %4
+}
+
+define i32 @constant_multiplied_4xi32_at_idx1(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_4xi32_at_idx1(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 2
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 1
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+ ret i32 %4
+}
+
+define i32 @negative_constant_multiplied_4xi32(i32 %0) {
+; CHECK-LABEL: @negative_constant_multiplied_4xi32(
+; CHECK-NEXT: ret i32 poison
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 1
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+ ret i32 %4
+}
+
+define i32 @constant_multiplied_6xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_6xi32(
+; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP0:%.*]], 6
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <6 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v6i32(<6 x i32> %3)
+ ret i32 %4
+}
+
+define i64 @constant_multiplied_6xi64(i64 %0) {
+; CHECK-LABEL: @constant_multiplied_6xi64(
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0:%.*]], 6
+; CHECK-NEXT: ret i64 [[TMP2]]
+;
+ %2 = insertelement <4 x i64> poison, i64 %0, i64 0
+ %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <6 x i32> zeroinitializer
+ %4 = tail call i64 @llvm.vector.reduce.add.v6i64(<6 x i64> %3)
+ ret i64 %4
+}
+
+define i1 @constant_multiplied_8xi1(i1 %0) {
+; CHECK-LABEL: @constant_multiplied_8xi1(
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i1> poison, i1 [[TMP0:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT: [[TMP5:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i8 [[TMP5]] to i1
+; CHECK-NEXT: ret i1 [[TMP6]]
+;
+ %2 = insertelement <8 x i1> poison, i1 %0, i32 0
+ %3 = shufflevector <8 x i1> %2, <8 x i1> poison, <8 x i32> zeroinitializer
+ %4 = tail call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> %3)
+ ret i1 %4
+}
+
+define i2 @constant_multiplied_4xi2(i2 %0) {
+; CHECK-LABEL: @constant_multiplied_4xi2(
+; CHECK-NEXT: ret i2 0
+;
+ %2 = insertelement <4 x i2> poison, i2 %0, i32 0
+ %3 = shufflevector <4 x i2> %2, <4 x i2> poison, <4 x i32> zeroinitializer
+ %4 = tail call i2 @llvm.vector.reduce.add.v4i2(<4 x i2> %3)
+ ret i2 %4
+}
+
+define i2 @constant_multiplied_5xi2(i2 %0) {
+; CHECK-LABEL: @constant_multiplied_5xi2(
+; CHECK-NEXT: ret i2 [[TMP0:%.*]]
+;
+ %2 = insertelement <5 x i2> poison, i2 %0, i64 0
+ %3 = shufflevector <5 x i2> %2, <5 x i2> poison, <5 x i32> zeroinitializer
+ %4 = tail call i2 @llvm.vector.reduce.add.v5i2(<5 x i2> %3)
+ ret i2 %4
+}
+
+define i2 @constant_multiplied_6xi2(i2 %0) {
+; CHECK-LABEL: @constant_multiplied_6xi2(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i2 [[TMP0:%.*]], 1
+; CHECK-NEXT: ret i2 [[TMP2]]
+;
+ %2 = insertelement <6 x i2> poison, i2 %0, i64 0
+ %3 = shufflevector <6 x i2> %2, <6 x i2> poison, <6 x i32> zeroinitializer
+ %4 = tail call i2 @llvm.vector.reduce.add.v6i2(<6 x i2> %3)
+ ret i2 %4
+}
+
+define i2 @constant_multiplied_7xi2(i2 %0) {
+; CHECK-LABEL: @constant_multiplied_7xi2(
+; CHECK-NEXT: [[TMP2:%.*]] = sub i2 0, [[TMP0:%.*]]
+; CHECK-NEXT: ret i2 [[TMP2]]
+;
+ %2 = insertelement <7 x i2> poison, i2 %0, i64 0
+ %3 = shufflevector <7 x i2> %2, <7 x i2> poison, <7 x i32> zeroinitializer
+ %4 = tail call i2 @llvm.vector.reduce.add.v7i2(<7 x i2> %3)
+ ret i2 %4
+}
+
+define i32 @negative_scalable_vector(i32 %0) {
+; CHECK-LABEL: @negative_scalable_vector(
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP3]])
+; CHECK-NEXT: ret i32 [[TMP4]]
+;
+ %2 = insertelement <vscale x 4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <vscale x 4 x i32> %2, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %3)
+ ret i32 %4
+}
diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
index b3338f47..75420d4 100644
--- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
+++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
declare void @llvm.assume(i1)
@@ -47,29 +47,8 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
br label %loop.header
@@ -123,27 +102,8 @@ define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ]
@@ -216,29 +176,8 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 2) ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
br label %loop.header
@@ -312,29 +251,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
br label %loop.header
@@ -408,29 +326,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
br label %loop.header
@@ -504,29 +401,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
br label %loop.header
@@ -596,29 +472,8 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias noundef %a
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ]
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
br label %loop.header
@@ -692,29 +547,8 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
br label %loop.header
@@ -747,7 +581,7 @@ exit:
define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{
; CHECK-LABEL: define void @deref_assumption_in_header_variable_trip_count(
; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
@@ -792,30 +626,8 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef
; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
;
entry:
br label %loop.header
@@ -867,28 +679,8 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ]
@@ -958,28 +750,8 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 3999) ]
@@ -1031,28 +803,8 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
@@ -1105,28 +857,8 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ]
@@ -1196,28 +928,8 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ]
@@ -1287,28 +999,8 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 3999) ]
@@ -1376,27 +1068,8 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ]
@@ -1465,27 +1138,8 @@ define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_c
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK: [[LOOP_THEN]]:
-; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
;
entry:
%a = call ptr @get_ptr()
@@ -1519,25 +1173,304 @@ exit:
declare ptr @get_ptr()
declare void @may_free()
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
-; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
-; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
-; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
-; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
-; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]}
-; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
-; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
-; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
-; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
-;.
+define void @deref_assumption_in_header_constant_trip_count_nofree_via_context(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nosync {
+; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_nofree_via_context(
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD1]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_IF1]]:
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
+;
+entry:
+ call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep.a = getelementptr i32, ptr %a, i64 %iv
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %l.b = load i32, ptr %gep.b, align 4
+ %c.1 = icmp sge i32 %l.b, 0
+ br i1 %c.1, label %loop.latch, label %loop.then
+
+loop.then:
+ %l.a = load i32, ptr %gep.a, align 4
+ br label %loop.latch
+
+loop.latch:
+ %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ]
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %merge, ptr %gep.c, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 1000
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define void @deref_assumption_in_header_constant_trip_count_may_free(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nosync {
+; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_may_free(
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ]
+; CHECK-NEXT: call void @may_free()
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP8]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_IF1]]:
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], %[[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP14]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP15]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
+;
+entry:
+ call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
+ call void @may_free()
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep.a = getelementptr i32, ptr %a, i64 %iv
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %l.b = load i32, ptr %gep.b, align 4
+ %c.1 = icmp sge i32 %l.b, 0
+ br i1 %c.1, label %loop.latch, label %loop.then
+
+loop.then:
+ %l.a = load i32, ptr %gep.a, align 4
+ br label %loop.latch
+
+loop.latch:
+ %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ]
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %merge, ptr %gep.c, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 1000
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define void @deref_assumption_in_header_constant_trip_count_nofree_via_context_but_missing_nosync(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) {
+; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_nofree_via_context_but_missing_nosync(
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_IF1]]:
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP13]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
+;
+entry:
+ call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep.a = getelementptr i32, ptr %a, i64 %iv
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %l.b = load i32, ptr %gep.b, align 4
+ %c.1 = icmp sge i32 %l.b, 0
+ br i1 %c.1, label %loop.latch, label %loop.then
+
+loop.then:
+ %l.a = load i32, ptr %gep.a, align 4
+ br label %loop.latch
+
+loop.latch:
+ %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ]
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %merge, ptr %gep.c, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 1000
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
+define void @deref_assumption_in_header_constant_trip_count_multiple_loop_predecessors(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i1 %pre) nosync {
+; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_multiple_loop_predecessors(
+; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i1 [[PRE:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ]
+; CHECK-NEXT: br i1 [[PRE]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: store i32 0, ptr [[A]], align 4
+; CHECK-NEXT: br label %[[LOOP_HEADER_PREHEADER:.*]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: store i32 0, ptr [[B]], align 4
+; CHECK-NEXT: br label %[[LOOP_HEADER_PREHEADER]]
+; CHECK: [[LOOP_HEADER_PREHEADER]]:
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
+; CHECK: [[PRED_LOAD_IF]]:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]]
+; CHECK: [[PRED_LOAD_CONTINUE]]:
+; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_IF1]]:
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1
+; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]]
+; CHECK: [[PRED_LOAD_CONTINUE2]]:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP13]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br [[EXIT:label %.*]]
+; CHECK: [[SCALAR_PH:.*:]]
+;
+entry:
+ call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
+ br i1 %pre, label %then, label %else
+
+then:
+ store i32 0, ptr %a
+ br label %loop.header
+
+else:
+ store i32 0, ptr %b
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %then ], [ 0, %else ], [ %iv.next, %loop.latch ]
+ %gep.a = getelementptr i32, ptr %a, i64 %iv
+ %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+ %l.b = load i32, ptr %gep.b, align 4
+ %c.1 = icmp sge i32 %l.b, 0
+ br i1 %c.1, label %loop.latch, label %loop.then
+
+loop.then:
+ %l.a = load i32, ptr %gep.a, align 4
+ br label %loop.latch
+
+loop.latch:
+ %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ]
+ %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+ store i32 %merge, ptr %gep.c, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 1000
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index cb0c778..73d5e26 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -220,14 +220,18 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1)
; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]]
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP9]], 2
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDVAR_LCSSA1]], 2
+; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0)
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK: [[VECTOR_SCEVCHECK]]:
; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], 1
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP9]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP9]], [[N_MOD_VF]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP15]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP15]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[IV_1_LCSSA]], [[N_VEC]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -239,7 +243,7 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP9]], [[N_VEC]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP15]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[IV_1_LCSSA]], %[[LOOP_2_PREHEADER]] ], [ [[IV_1_LCSSA]], %[[VECTOR_SCEVCHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
index 0b86a22..027dcaf 100644
--- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll
@@ -22,13 +22,11 @@ define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) {
; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
-; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -94,13 +92,11 @@ define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) {
; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
-; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]]
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -233,13 +229,11 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1,
; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]]
-; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]]
; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX2]], 0
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX2]], 1
@@ -414,26 +408,20 @@ define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress {
; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true
; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: store <4 x i16> splat (i16 1), ptr [[TMP4]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[EXIT:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
; CHECK-NEXT: store i16 [[G_16]], ptr [[GEP]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]]
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
index b056f44..8d20a3b 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll
@@ -14,16 +14,9 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
; CHECK-NEXT: [[SUB:%.*]] = add i32 [[XA]], -1
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SUB]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[XB]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[SMAX7:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP2]], i64 32000)
-; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i64 [[TMP2]], 32000
-; CHECK-NEXT: [[UMIN8:%.*]] = zext i1 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP2]], [[UMIN8]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[SMAX7]], [[TMP4]]
-; CHECK-NEXT: [[UMAX9:%.*]] = tail call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1)
-; CHECK-NEXT: [[TMP6:%.*]] = udiv i64 [[TMP5]], [[UMAX9]]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], [[UMIN8]]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP0]], i64 31999)
+; CHECK-NEXT: [[SMAX10:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[SMAX10]], [[TMP0]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP8]], 23
; CHECK-NEXT: [[IDENT_CHECK_NOT:%.*]] = icmp eq i32 [[XB]], 1
; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[MIN_ITERS_CHECK]], [[IDENT_CHECK_NOT]]
@@ -50,13 +43,11 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER13]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP8]], -8
-; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[N_VEC]], [[TMP1]]
-; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP18]], [[TMP0]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], [[TMP0]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP19]], [[TMP0]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[INDEX]], [[TMP0]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META0:![0-9]+]]
@@ -75,7 +66,7 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]]
-; CHECK: for.body.preheader13:
+; CHECK: for.body.preheader14:
; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll b/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll
new file mode 100644
index 0000000..9cdcdf1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s -slp-threshold=-100 | FileCheck %s
+define i1 @foo(i1 %v) { ; assume %v is 1
+; CHECK-LABEL: define i1 @foo(
+; CHECK-SAME: i1 [[V:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i1> poison, i1 [[V]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i1> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: [[SUB:%.*]] = sub i1 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: ret i1 [[SUB]]
+;
+entry:
+ %not = xor i1 %v, 1 ; 0
+ %not1 = xor i1 %not, 1 ; 1
+ %mul = mul i1 %v, 1 ; 1
+ %sub = sub i1 %not1, %mul ; 0
+ ret i1 %sub ; 0
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll b/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
index 39703e9..9d78b97 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
@@ -755,6 +755,25 @@ bb3:
ret i1 %phi
}
+define i32 @negative_constfold_select() {
+; CHECK-LABEL: @negative_constfold_select(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret i32 poison
+;
+entry:
+ switch i32 poison, label %default [
+ i32 0, label %bb
+ i32 2, label %bb
+ ]
+
+bb:
+ br label %default
+
+default:
+ %ret = phi i32 [ poison, %entry ], [ poison, %bb ]
+ ret i32 %ret
+}
+
!0 = !{!"function_entry_count", i64 1000}
!1 = !{!"branch_weights", i32 3, i32 5, i32 7}
!2 = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11, i32 13}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
index 6341c89..1503a1b 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll
@@ -14,9 +14,9 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -36,9 +36,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -58,9 +58,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -80,9 +80,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_outer_nneg(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -102,9 +102,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg_outer_sext(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -125,9 +125,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(ptr %di) {
; CHECK-NEXT: call void @use.i32(i32 0)
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -148,9 +148,9 @@ define <4 x i32> @load_i32_sext_zext_to_v4i32(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -170,9 +170,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_load_other_users(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: call void @use.i32(i32 [[L]])
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
@@ -194,9 +194,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_ins_other_users(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: call void @use.v2i32(<2 x i32> [[VEC_INS]])
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
@@ -218,9 +218,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_bc_other_users(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: call void @use.v8i8(<8 x i8> [[VEC_BC]])
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
@@ -266,10 +266,10 @@ define <4 x i32> @load_i32_zext_to_v4i32_shuffle_other_users(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
-; CHECK-NEXT: call void @use.v8i16(<4 x i16> [[VEC_SHUFFLE]])
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32>
+; CHECK-NEXT: call void @use.v8i16(<4 x i16> [[E_1]])
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -290,9 +290,9 @@ define <8 x i32> @load_i64_zext_to_v8i32(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = zext <16 x i8> [[VEC_BC]] to <16 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[VEC_SHUFFLE]] to <8 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_SHUFFLE]] to <8 x i16>
+; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[EXT_1]] to <8 x i32>
; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]]
;
entry:
@@ -312,9 +312,9 @@ define <3 x i32> @load_i24_zext_to_v3i32(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = zext <6 x i8> [[VEC_BC]] to <6 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[VEC_SHUFFLE]] to <3 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: [[EXT_1:%.*]] = zext <3 x i8> [[VEC_SHUFFLE]] to <3 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[EXT_1]] to <3 x i32>
; CHECK-NEXT: ret <3 x i32> [[EXT_2]]
;
entry:
@@ -334,9 +334,9 @@ define <4 x i32> @load_i32_insert_idx_1_sext(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[L]], i64 1
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -356,9 +356,9 @@ define <4 x i32> @mask_extracts_not_all_elements_1_sext(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -378,9 +378,9 @@ define <4 x i32> @mask_extracts_not_all_elements_2_sext(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -422,9 +422,9 @@ define <4 x i32> @load_i32_sext_to_v4i32(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[E_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[E_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -444,9 +444,9 @@ define <8 x i32> @load_i64_sext_to_v8i32(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = sext <16 x i8> [[VEC_BC]] to <16 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[VEC_SHUFFLE]] to <8 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_SHUFFLE]] to <8 x i16>
+; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[EXT_1]] to <8 x i32>
; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]]
;
entry:
@@ -466,9 +466,9 @@ define <3 x i32> @load_i24_sext_to_v3i32(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = sext <6 x i8> [[VEC_BC]] to <6 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[VEC_SHUFFLE]] to <3 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: [[EXT_1:%.*]] = sext <3 x i8> [[VEC_SHUFFLE]] to <3 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[EXT_1]] to <3 x i32>
; CHECK-NEXT: ret <3 x i32> [[EXT_2]]
;
entry:
@@ -488,9 +488,9 @@ define <4 x i32> @load_i32_insert_idx_1(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[L]], i64 1
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -510,9 +510,9 @@ define <4 x i32> @mask_extracts_not_all_elements_1(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
-; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
@@ -532,9 +532,9 @@ define <4 x i32> @mask_extracts_not_all_elements_2(ptr %di) {
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
-; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK-NEXT: [[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
;
entry:
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index acbc836..ed29719 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -205,8 +205,8 @@ define <8 x i8> @abs_different(<8 x i8> %a) {
define <4 x i32> @poison_intrinsic(<2 x i16> %l256) {
; CHECK-LABEL: @poison_intrinsic(
; CHECK-NEXT: [[L266:%.*]] = call <2 x i16> @llvm.abs.v2i16(<2 x i16> [[L256:%.*]], i1 false)
-; CHECK-NEXT: [[L267:%.*]] = zext <2 x i16> [[L266]] to <2 x i32>
-; CHECK-NEXT: [[L271:%.*]] = shufflevector <2 x i32> [[L267]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[L267:%.*]] = shufflevector <2 x i16> [[L266]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[L271:%.*]] = zext <4 x i16> [[L267]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[L271]]
;
%l266 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %l256, i1 false)
@@ -534,9 +534,9 @@ define <4 x i64> @single_zext(<4 x i32> %x) {
define <4 x i64> @not_zext(<4 x i32> %x) {
; CHECK-LABEL: @not_zext(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[X:%.*]] to <4 x i64>
-; CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ZEXT]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: ret <4 x i64> [[REVSHUF]]
+; CHECK-NEXT:    [[REVSHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext <4 x i32> [[REVSHUF]] to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> [[ZEXT]]
;
%zext = zext <4 x i32> %x to <4 x i64>
%revshuf = shufflevector <4 x i64> %zext, <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -922,10 +922,9 @@ define <4 x i8> @singleop(<4 x i8> %a, <4 x i8> %b) {
define <4 x i64> @cast_mismatched_types(<4 x i32> %x) {
; CHECK-LABEL: @cast_mismatched_types(
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i32> [[SHUF]] to <2 x i64>
-; CHECK-NEXT: [[EXTSHUF:%.*]] = shufflevector <2 x i64> [[ZEXT]], <2 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: ret <4 x i64> [[EXTSHUF]]
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[X]] to <4 x i64>
+; CHECK-NEXT: ret <4 x i64> [[ZEXT]]
;
%shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
%zext = zext <2 x i32> %shuf to <2 x i64>
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll
index 8c50484..b293976 100644
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll
@@ -392,7 +392,7 @@ define <4 x i32> @shuffle_v4i32(<3 x i32> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -427,7 +427,7 @@ define <8 x i32> @shuffle_v8i32(<3 x i32> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -462,7 +462,7 @@ define <16 x i32> @shuffle_v16i32(<3 x i32> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -497,7 +497,7 @@ define <32 x i32> @shuffle_v32i32(<3 x i32> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -1092,7 +1092,7 @@ define <4 x float> @shuffle_v4f32(<3 x float> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -1127,7 +1127,7 @@ define <6 x float> @shuffle_v6f32(<3 x float> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -1162,7 +1162,7 @@ define <8 x float> @shuffle_v8f32(<3 x float> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -1197,7 +1197,7 @@ define <16 x float> @shuffle_v16f32(<3 x float> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -1232,7 +1232,7 @@ define <32 x float> @shuffle_v32f32(<3 x float> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
diff --git a/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll
index 59422e9..594017e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll
@@ -605,7 +605,7 @@ define <4 x bfloat> @shuffle_v4bf16(<3 x bfloat> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -640,7 +640,7 @@ define <6 x bfloat> @shuffle_v6bf16(<3 x bfloat> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -675,7 +675,7 @@ define <8 x bfloat> @shuffle_v8bf16(<3 x bfloat> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -710,7 +710,7 @@ define <16 x bfloat> @shuffle_v16bf16(<3 x bfloat> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -745,7 +745,7 @@ define <32 x bfloat> @shuffle_v32bf16(<3 x bfloat> %arg0, i1 %cond) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-NEXT: tail call void @func0()
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
@@ -850,7 +850,7 @@ define <4 x half> @shuffle_v4f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V1-NEXT: [[ENTRY:.*:]]
; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V1: [[THEN]]:
-; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V1-NEXT: tail call void @func0()
; CHECK-V1-NEXT: br label %[[FINALLY:.*]]
; CHECK-V1: [[ELSE]]:
@@ -866,7 +866,7 @@ define <4 x half> @shuffle_v4f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V2-NEXT: [[ENTRY:.*:]]
; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V2: [[THEN]]:
-; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V2-NEXT: tail call void @func0()
; CHECK-V2-NEXT: br label %[[FINALLY:.*]]
; CHECK-V2: [[ELSE]]:
@@ -933,7 +933,7 @@ define <6 x half> @shuffle_v6f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V1-NEXT: [[ENTRY:.*:]]
; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V1: [[THEN]]:
-; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V1-NEXT: tail call void @func0()
; CHECK-V1-NEXT: br label %[[FINALLY:.*]]
; CHECK-V1: [[ELSE]]:
@@ -949,7 +949,7 @@ define <6 x half> @shuffle_v6f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V2-NEXT: [[ENTRY:.*:]]
; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V2: [[THEN]]:
-; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V2-NEXT: tail call void @func0()
; CHECK-V2-NEXT: br label %[[FINALLY:.*]]
; CHECK-V2: [[ELSE]]:
@@ -1016,7 +1016,7 @@ define <8 x half> @shuffle_v8f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V1-NEXT: [[ENTRY:.*:]]
; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V1: [[THEN]]:
-; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V1-NEXT: tail call void @func0()
; CHECK-V1-NEXT: br label %[[FINALLY:.*]]
; CHECK-V1: [[ELSE]]:
@@ -1032,7 +1032,7 @@ define <8 x half> @shuffle_v8f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V2-NEXT: [[ENTRY:.*:]]
; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V2: [[THEN]]:
-; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V2-NEXT: tail call void @func0()
; CHECK-V2-NEXT: br label %[[FINALLY:.*]]
; CHECK-V2: [[ELSE]]:
@@ -1099,7 +1099,7 @@ define <16 x half> @shuffle_v16f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V1-NEXT: [[ENTRY:.*:]]
; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V1: [[THEN]]:
-; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V1-NEXT: tail call void @func0()
; CHECK-V1-NEXT: br label %[[FINALLY:.*]]
; CHECK-V1: [[ELSE]]:
@@ -1115,7 +1115,7 @@ define <16 x half> @shuffle_v16f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V2-NEXT: [[ENTRY:.*:]]
; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V2: [[THEN]]:
-; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V2-NEXT: tail call void @func0()
; CHECK-V2-NEXT: br label %[[FINALLY:.*]]
; CHECK-V2: [[ELSE]]:
@@ -1182,7 +1182,7 @@ define <32 x half> @shuffle_v32f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V1-NEXT: [[ENTRY:.*:]]
; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V1: [[THEN]]:
-; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V1-NEXT: tail call void @func0()
; CHECK-V1-NEXT: br label %[[FINALLY:.*]]
; CHECK-V1: [[ELSE]]:
@@ -1198,7 +1198,7 @@ define <32 x half> @shuffle_v32f16(<3 x half> %arg0, i1 %cond) {
; CHECK-V2-NEXT: [[ENTRY:.*:]]
; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK-V2: [[THEN]]:
-; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 1, i32 2, i32 0>
+; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> <i32 2, i32 0, i32 1>
; CHECK-V2-NEXT: tail call void @func0()
; CHECK-V2-NEXT: br label %[[FINALLY:.*]]
; CHECK-V2: [[ELSE]]:
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
index fba4b60..82a7399 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
@@ -342,3 +342,59 @@ define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
%r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i32> %r
}
+
+; Unary shuffles
+
+define <4 x i16> @unary_shuffle_zext_v8i8_v4i16(<8 x i8> %a0) {
+; CHECK-LABEL: define <4 x i16> @unary_shuffle_zext_v8i8_v4i16(
+; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[A0]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[X1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: ret <4 x i16> [[X1]]
+;
+ %x1 = zext <8 x i8> %a0 to <8 x i16>
+ %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i16> %vec.shuffle
+}
+
+define <4 x i16> @unary_shuffle_sext_v8i8_v4i16(<8 x i8> %a0) {
+; CHECK-LABEL: define <4 x i16> @unary_shuffle_sext_v8i8_v4i16(
+; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[A0]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[X1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16>
+; CHECK-NEXT: ret <4 x i16> [[X1]]
+;
+ %x1 = sext <8 x i8> %a0 to <8 x i16>
+ %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i16> %vec.shuffle
+}
+
+; negative - avoid loop with foldBitcastOfShuffle
+
+define <2 x i32> @unary_shuffle_bitcast_v8i8_v2i32(<8 x i8> %a0) {
+; CHECK-LABEL: define <2 x i32> @unary_shuffle_bitcast_v8i8_v2i32(
+; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[X1:%.*]] = bitcast <8 x i8> [[A0]] to <2 x i32>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <2 x i32> [[X1]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: ret <2 x i32> [[VEC_SHUFFLE]]
+;
+ %x1 = bitcast <8 x i8> %a0 to <2 x i32>
+ %vec.shuffle = shufflevector <2 x i32> %x1, <2 x i32> poison, <2 x i32> <i32 0, i32 1>
+ ret <2 x i32> %vec.shuffle
+}
+
+; negative - multiuse
+
+define <4 x i16> @unary_shuffle_sext_v8i8_v4i16_multiuse(<8 x i8> %a0, ptr %a1) {
+; CHECK-LABEL: define <4 x i16> @unary_shuffle_sext_v8i8_v4i16_multiuse(
+; CHECK-SAME: <8 x i8> [[A0:%.*]], ptr [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[X1:%.*]] = sext <8 x i8> [[A0]] to <8 x i16>
+; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[X1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: store <8 x i16> [[X1]], ptr [[A1]], align 16
+; CHECK-NEXT: ret <4 x i16> [[VEC_SHUFFLE]]
+;
+ %x1 = sext <8 x i8> %a0 to <8 x i16>
+ %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <8 x i16> %x1, ptr %a1, align 16
+ ret <4 x i16> %vec.shuffle
+}
diff --git a/llvm/test/Verifier/preallocated-invalid.ll b/llvm/test/Verifier/preallocated-invalid.ll
index 38ed106..2c5aff2 100644
--- a/llvm/test/Verifier/preallocated-invalid.ll
+++ b/llvm/test/Verifier/preallocated-invalid.ll
@@ -65,13 +65,21 @@ define void @preallocated_one_call() {
ret void
}
-; CHECK: must be a constant
+; CHECK: immarg operand has non-immediate parameter
define void @preallocated_setup_constant() {
%ac = call i32 @blackbox()
%cs = call token @llvm.call.preallocated.setup(i32 %ac)
ret void
}
+; CHECK: llvm.call.preallocated.alloc arg index must be a constant
+define void @preallocated_arg_constant() {
+ %ac = call i32 @blackbox()
+ %cs = call token @llvm.call.preallocated.setup(i32 3)
+ call token @llvm.call.preallocated.arg(token %cs, i32 %ac)
+ ret void
+}
+
; CHECK: must be between 0 and corresponding
define void @preallocated_setup_arg_index_in_bounds() {
%cs = call token @llvm.call.preallocated.setup(i32 2)
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index dd3f947..781240a 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -48,15 +48,17 @@ config.suffixes = [".ll", ".c", ".test", ".txt", ".s", ".mir", ".yaml", ".spv"]
# directories.
config.excludes = ["Inputs", "CMakeLists.txt", "README.txt", "LICENSE.txt"]
-# Exclude llvm-reduce tests for profcheck because we substitute the FileCheck
-# binary with a no-op command for profcheck, but llvm-reduce tests have RUN
-# commands of the form llvm-reduce --test FileCheck, which explode if we
-# substitute FileCheck because llvm-reduce expects FileCheck in these tests.
-# It's not really possible to exclude these tests from the command substitution,
-# so we just exclude llvm-reduce tests from this config altogether. This should
-# be fine though as profcheck config tests are mostly concerned with opt.
if config.enable_profcheck:
- config.excludes = config.excludes + ["llvm-reduce"]
+ # Exclude llvm-reduce tests for profcheck because we substitute the FileCheck
+ # binary with a no-op command for profcheck, but llvm-reduce tests have RUN
+ # commands of the form llvm-reduce --test FileCheck, which explode if we
+ # substitute FileCheck because llvm-reduce expects FileCheck in these tests.
+ # It's not really possible to exclude these tests from the command substitution,
+ # so we just exclude llvm-reduce tests from this config altogether. This should
+ # be fine though as profcheck config tests are mostly concerned with opt.
+ config.excludes.append("llvm-reduce")
+ # (Issue #161235) Temporarily exclude LoopVectorize.
+ config.excludes.append("LoopVectorize")
# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)
diff --git a/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml b/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml
index 5312c25..17e91f1 100644
--- a/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml
+++ b/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml
@@ -2,9 +2,9 @@
# Then manually tampered with some of the values of the attribute
# I hope there are easier ways to construct tests like this.
-# RUN: yaml2obj %s -o verify_stmt_seq.o
-# RUN: not llvm-dwarfdump -verify -debug-info verify_stmt_seq.o | FileCheck %s --check-prefix=CHECK_INVALID --implicit-check-not=error:
-# RUN: llvm-dwarfdump -debug-line -verbose -debug-info verify_stmt_seq.o | FileCheck %s --check-prefix=CHECK_DEBUG_LINE
+# RUN: yaml2obj %s -o %t.o
+# RUN: not llvm-dwarfdump -verify -debug-info %t.o | FileCheck %s --check-prefix=CHECK_INVALID --implicit-check-not=error:
+# RUN: llvm-dwarfdump -debug-line -verbose -debug-info %t.o | FileCheck %s --check-prefix=CHECK_DEBUG_LINE
# CHECK_INVALID: error: DW_AT_LLVM_stmt_sequence offset 0x00000000 is not within the line table bounds [0x00000034, 0x000000fd)
# CHECK_INVALID: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x00000000)
diff --git a/llvm/test/tools/llvm-lib/sym64-threshold.test b/llvm/test/tools/llvm-lib/sym64-threshold.test
new file mode 100644
index 0000000..76f0a03
--- /dev/null
+++ b/llvm/test/tools/llvm-lib/sym64-threshold.test
@@ -0,0 +1,71 @@
+# RUN: yaml2obj --docnum=1 %s -o %t01234567890234567789.obj
+# RUN: yaml2obj --docnum=2 %s -o %t-ec.obj
+# RUN: env SYM64_THRESHOLD=100 llvm-lib -machine:amd64 -out:%t.lib %t01234567890234567789.obj
+# RUN: llvm-nm --print-armap %t.lib | FileCheck --check-prefix=ARMAP %s
+# ARMAP: Archive map
+# ARMAP-NEXT: sym
+
+# RUN: env SYM64_THRESHOLD=100 not llvm-lib -machine:arm64x -out:%t-ec.lib %t-ec.obj %t01234567890234567789.obj 2>&1 | FileCheck %s
+# CHECK: Archive is too large: ARM64X does not support archives larger than 4GB
+
+--- !COFF
+header:
+ Machine: IMAGE_FILE_MACHINE_AMD64
+ Characteristics: [ ]
+sections:
+ - Name: .text
+ Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+ Alignment: 4
+ SectionData: ''
+symbols:
+ - Name: .text
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_NULL
+ StorageClass: IMAGE_SYM_CLASS_STATIC
+ SectionDefinition:
+ Length: 0
+ NumberOfRelocations: 0
+ NumberOfLinenumbers: 0
+ CheckSum: 0
+ Number: 1
+ - !Symbol
+ Name: sym
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION # (2)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+...
+
+--- !COFF
+header:
+ Machine: IMAGE_FILE_MACHINE_ARM64
+ Characteristics: [ ]
+sections:
+ - Name: .text
+ Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+ Alignment: 4
+ SectionData: ''
+symbols:
+ - Name: .text
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL
+ ComplexType: IMAGE_SYM_DTYPE_NULL
+ StorageClass: IMAGE_SYM_CLASS_STATIC
+ SectionDefinition:
+ Length: 0
+ NumberOfRelocations: 0
+ NumberOfLinenumbers: 0
+ CheckSum: 0
+ Number: 1
+ - !Symbol
+ Name: sym
+ Value: 0
+ SectionNumber: 1
+ SimpleType: IMAGE_SYM_TYPE_NULL # (0)
+ ComplexType: IMAGE_SYM_DTYPE_FUNCTION # (2)
+ StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2)
+...
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading-fail.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading-fail.test
new file mode 100644
index 0000000..391b7ee
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading-fail.test
@@ -0,0 +1,26 @@
+## Test that --offloading emits a warning on a truncated fatbin instead of crashing.
+# REQUIRES: amdgpu-registered-target
+
+# RUN: yaml2obj %s -o %t.elf
+# RUN: llvm-readobj --offloading %t.elf 2>&1 | \
+# RUN: FileCheck %s --check-prefix=WARN -DFILE_NAME=%t.elf
+
+# WARN: warning: '{{.*}}': Stream Error: The stream is too short to perform the requested operation.
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+Sections:
+ - Name: .hip_fatbin
+ Type: SHT_PROGBITS
+ AddressAlign: 0x1000
+ Content: 5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B0000000000000075782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D67667839303800000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000340500000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C756583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7983A72E6F66667365742EA52E73697A6502AB2E7661
6C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A6540AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B000000000000001800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E20323431373620663935303039613166393032313232343865313036333964653837653635636163616338643961372900000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E72656C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3973696D706C65416464506A504B6A6D005F5A3973696D7
06C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E0000000200000008000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000
+ - Name: .hipFatBinSegment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x202FD0
+ AddressAlign: 0x8
+ Content: '465049480100000000102000000000000000000000000000'
+...
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test
new file mode 100644
index 0000000..21ee60d
--- /dev/null
+++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test
@@ -0,0 +1,27 @@
+## Test that --offloading with a fatbin works correctly.
+# REQUIRES: amdgpu-registered-target
+
+# RUN: yaml2obj %s -o %t.elf
+# RUN: llvm-readobj --offloading %t.elf | \
+# RUN: FileCheck %s -DFILE_NAME=%t.elf
+
+# CHECK: host-x86_64-unknown-linux-- file://[[FILE_NAME]]#offset=8192&size=0
+# CHECK-NEXT: hipv4-amdgcn-amd-amdhsa--gfx908 file://[[FILE_NAME]]#offset=8192&size=4048
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+Sections:
+ - Name: .hip_fatbin
+ Type: SHT_PROGBITS
+ AddressAlign: 0x1000
+ Content: 5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B00000000000000686F73742D7838365F36342D756E6B6E6F776E2D6C696E75782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D6766783930380000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000340500000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C756583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E646572
5F7983A72E6F66667365742EA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A6540AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B000000000000001800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E20323431373620663935303039613166393032313232343865313036333964653837653635636163616338643961372900000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E72656C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3
973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E0000000200000008000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000
+ - Name: .hipFatBinSegment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x202FD0
+ AddressAlign: 0x8
+ Content: '465049480100000000102000000000000000000000000000'
+...
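For context: each entry in a __CLANG_OFFLOAD_BUNDLE__ fatbin carries a target triple plus an offset/size pair, and the dumper renders one "<triple> file://<object path>#offset=N&size=M" line per entry, which is what the FileCheck patterns above match. A minimal sketch of the same flow, built only from the API used by the ObjDumper change later in this patch (illustrative, not the tool's exact code):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Object/ObjectFile.h"
    #include "llvm/Object/OffloadBundle.h"
    #include "llvm/Support/Error.h"

    using namespace llvm;

    static void dumpOffloadEntries(const object::ObjectFile &Obj) {
      SmallVector<object::OffloadBundleFatBin> Bundles;
      if (Error Err = object::extractOffloadBundleFatBinary(Obj, Bundles))
        consumeError(std::move(Err)); // llvm-readobj reports this as a warning.
      // One URI line per entry, e.g.
      // hipv4-amdgcn-amd-amdhsa--gfx908 file://<path>#offset=8192&size=4048
      for (const object::OffloadBundleFatBin &Bundle : Bundles)
        Bundle.printEntriesAsURI();
    }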
diff --git a/llvm/test/tools/llvm-size/macho-pagezero.test b/llvm/test/tools/llvm-size/macho-pagezero.test
new file mode 100644
index 0000000..db69fd0
--- /dev/null
+++ b/llvm/test/tools/llvm-size/macho-pagezero.test
@@ -0,0 +1,108 @@
+## Test the --exclude-pagezero option to skip the __PAGEZERO segment in Mach-O files.
+
+# RUN: yaml2obj %s --docnum=1 -o %t-pagezero.o
+# RUN: llvm-size %t-pagezero.o | \
+# RUN: FileCheck %s --check-prefix=NORMAL --match-full-lines
+# RUN: llvm-size --exclude-pagezero %t-pagezero.o | \
+# RUN: FileCheck %s --check-prefix=SKIP --match-full-lines
+
+# RUN: yaml2obj %s --docnum=2 -o %t-pagezero32.o
+# RUN: llvm-size %t-pagezero32.o | \
+# RUN: FileCheck %s --check-prefix=NORMAL --match-full-lines
+# RUN: llvm-size --exclude-pagezero %t-pagezero32.o | \
+# RUN: FileCheck %s --check-prefix=SKIP --match-full-lines
+
+# NORMAL:__TEXT __DATA __OBJC others dec hex
+# NORMAL-NEXT:20 100 0 4096 4216 1078
+
+# SKIP:__TEXT __DATA __OBJC others dec hex
+# SKIP-NEXT:20 100 0 0 120 78
+
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x2
+ ncmds: 3
+ sizeofcmds: 216
+ flags: 0x2000
+ reserved: 0x0
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __PAGEZERO
+ vmaddr: 0x0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 0
+ maxprot: 0
+ initprot: 0
+ nsects: 0
+ flags: 0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __TEXT
+ vmaddr: 0x100000000
+ vmsize: 20
+ fileoff: 248
+ filesize: 20
+ maxprot: 7
+ initprot: 5
+ nsects: 0
+ flags: 0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __DATA
+ vmaddr: 0x100001000
+ vmsize: 100
+ fileoff: 268
+ filesize: 100
+ maxprot: 7
+ initprot: 3
+ nsects: 0
+ flags: 0
+
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACE
+ cputype: 0x7
+ cpusubtype: 0x3
+ filetype: 0x2
+ ncmds: 3
+ sizeofcmds: 168
+ flags: 0x2000
+LoadCommands:
+ - cmd: LC_SEGMENT
+ cmdsize: 56
+ segname: __PAGEZERO
+ vmaddr: 0x0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 0
+ maxprot: 0
+ initprot: 0
+ nsects: 0
+ flags: 0
+ - cmd: LC_SEGMENT
+ cmdsize: 56
+ segname: __TEXT
+ vmaddr: 0x1000
+ vmsize: 20
+ fileoff: 196
+ filesize: 20
+ maxprot: 7
+ initprot: 5
+ nsects: 0
+ flags: 0
+ - cmd: LC_SEGMENT
+ cmdsize: 56
+ segname: __DATA
+ vmaddr: 0x2000
+ vmsize: 100
+ fileoff: 216
+ filesize: 100
+ maxprot: 7
+ initprot: 3
+ nsects: 0
+ flags: 0
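The expected numbers follow directly from the vmsize fields above: __TEXT = 20, __DATA = 100, and __PAGEZERO's 4096 lands in "others", so dec = 20 + 100 + 4096 = 4216 (hex 1078). With --exclude-pagezero the 4096 is dropped, leaving dec = 20 + 100 = 120 (hex 78); the 64-bit (LC_SEGMENT_64) and 32-bit (LC_SEGMENT) documents exercise the same path.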
diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
index ff86c8d..4c07c05 100644
--- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -118,6 +118,7 @@ json::Value renderCondState(const coverage::MCDCRecord::CondState CondState) {
case coverage::MCDCRecord::MCDC_False:
return json::Value(false);
}
+ llvm_unreachable("Unknown llvm::coverage::MCDCRecord::CondState enum");
}
json::Array gatherTestVectors(coverage::MCDCRecord &Record) {
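(The llvm_unreachable after a switch that covers every CondState value is there so compilers that cannot prove the switch exhaustive do not diagnose a missing return on the fall-through path; it changes nothing for valid enum values.)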
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 31bf6a9..e09ddb4 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -1519,10 +1519,10 @@ private:
static StringRef detectStubKind(const Session::MemoryRegionInfo &Stub) {
using namespace support::endian;
- auto Armv7MovWTle = byte_swap<uint32_t, endianness::little>(0xe300c000);
- auto Armv7BxR12le = byte_swap<uint32_t, endianness::little>(0xe12fff1c);
- auto Thumbv7MovWTle = byte_swap<uint32_t, endianness::little>(0x0c00f240);
- auto Thumbv7BxR12le = byte_swap<uint16_t, endianness::little>(0x4760);
+ auto Armv7MovWTle = byte_swap<uint32_t>(0xe300c000, endianness::little);
+ auto Armv7BxR12le = byte_swap<uint32_t>(0xe12fff1c, endianness::little);
+ auto Thumbv7MovWTle = byte_swap<uint32_t>(0x0c00f240, endianness::little);
+ auto Thumbv7BxR12le = byte_swap<uint16_t>(0x4760, endianness::little);
MemoryMatcher M(Stub.getContent());
if (M.matchMask(Thumbv7MovWTle)) {
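The byte_swap change here (mirrored in StringTableBuilderTest.cpp further down) is mechanical: the endianness moves from a second template parameter to an ordinary function argument, so it no longer has to be a compile-time constant. A short sketch of the new call style, matching the signature as used above:

    #include "llvm/Support/Endian.h"

    using namespace llvm;

    // Same value as the Armv7 stub word above; with the new signature the
    // endianness argument could also be chosen at run time.
    uint32_t Word =
        support::endian::byte_swap<uint32_t>(0xe300c000, endianness::little);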
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index bd670ae..0b59dd4 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -16,6 +16,8 @@
#include "llvm/Object/Archive.h"
#include "llvm/Object/Decompressor.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/OffloadBinary.h"
+#include "llvm/Object/OffloadBundle.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -230,4 +232,14 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj,
}
}
+void ObjDumper::printOffloading(const object::ObjectFile &Obj) {
+ SmallVector<llvm::object::OffloadBundleFatBin> Bundles;
+ if (Error Err = object::extractOffloadBundleFatBinary(Obj, Bundles))
+ reportWarning(std::move(Err), Obj.getFileName());
+
+ // Print out all the FatBin Bundles that are contained in this buffer.
+ for (const auto &[Index, Bundle] : llvm::enumerate(Bundles))
+ Bundle.printEntriesAsURI();
+}
+
} // namespace llvm
diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h
index a654078..d264394 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/llvm/tools/llvm-readobj/ObjDumper.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/OffloadBinary.h"
#include "llvm/Support/CommandLine.h"
#include <unordered_set>
@@ -188,6 +189,7 @@ public:
std::function<Error(const Twine &Msg)> WarningHandler;
void reportUniqueWarning(Error Err) const;
void reportUniqueWarning(const Twine &Msg) const;
+ void printOffloading(const object::ObjectFile &Obj);
protected:
ScopedPrinter &W;
diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td
index 711522c..97d5d7f 100644
--- a/llvm/tools/llvm-readobj/Opts.td
+++ b/llvm/tools/llvm-readobj/Opts.td
@@ -32,6 +32,7 @@ def file_header : FF<"file-header", "Display file header">;
def headers : FF<"headers", "Equivalent to setting: --file-header, --program-headers, --section-headers">;
defm hex_dump : Eq<"hex-dump", "Display the specified section(s) as hexadecimal bytes">, MetaVarName<"<name or index>">;
def pretty_print : FF<"pretty-print", "Pretty print JSON output">;
+def offloading : FF<"offloading", "Display the content of the offloading section">;
def relocs : FF<"relocs", "Display the relocation entries in the file">;
def section_data : FF<"section-data", "Display section data for each section shown. This option has no effect for GNU style output">;
def section_details : FF<"section-details", "Display the section details">;
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp
index 2b34761..5327731 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -135,6 +135,7 @@ static bool HashHistogram;
static bool Memtag;
static bool NeededLibraries;
static bool Notes;
+static bool Offloading;
static bool ProgramHeaders;
static bool SectionGroups;
static std::vector<std::string> SFrame;
@@ -274,6 +275,7 @@ static void parseOptions(const opt::InputArgList &Args) {
opts::Memtag = Args.hasArg(OPT_memtag);
opts::NeededLibraries = Args.hasArg(OPT_needed_libs);
opts::Notes = Args.hasArg(OPT_notes);
+ opts::Offloading = Args.hasArg(OPT_offloading);
opts::PrettyPrint = Args.hasArg(OPT_pretty_print);
opts::ProgramHeaders = Args.hasArg(OPT_program_headers);
opts::SectionGroups = Args.hasArg(OPT_section_groups);
@@ -459,6 +461,8 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer,
Dumper->printGnuHashTable();
if (opts::VersionInfo)
Dumper->printVersionInfo();
+ if (opts::Offloading)
+ Dumper->printOffloading(Obj);
if (opts::StringTable)
Dumper->printStringTable();
if (Obj.isELF()) {
@@ -707,6 +711,7 @@ int llvm_readobj_main(int argc, char **argv, const llvm::ToolContext &) {
opts::DynamicTable = true;
opts::Notes = true;
opts::VersionInfo = true;
+ opts::Offloading = true;
opts::UnwindInfo = true;
opts::SectionGroups = true;
opts::HashHistogram = true;
diff --git a/llvm/tools/llvm-size/Opts.td b/llvm/tools/llvm-size/Opts.td
index edae43f..88e39f2 100644
--- a/llvm/tools/llvm-size/Opts.td
+++ b/llvm/tools/llvm-size/Opts.td
@@ -21,6 +21,9 @@ def grp_mach_o : OptionGroup<"kind">, HelpText<"OPTIONS (Mach-O specific)">;
def arch_EQ : Joined<["--"], "arch=">, HelpText<"architecture(s) from a Mach-O file to dump">, Group<grp_mach_o>;
def : Separate<["--", "-"], "arch">, Alias<arch_EQ>;
def l : F<"l", "When format is darwin, use long format to include addresses and offsets">, Group<grp_mach_o>;
+def exclude_pagezero
+ : FF<"exclude-pagezero", "Do not include __PAGEZERO segment in totals">,
+ Group<grp_mach_o>;
def : F<"A", "Alias for --format">, Alias<format_EQ>, AliasArgs<["sysv"]>;
def : F<"B", "Alias for --format">, Alias<format_EQ>, AliasArgs<["berkeley"]>;
diff --git a/llvm/tools/llvm-size/llvm-size.cpp b/llvm/tools/llvm-size/llvm-size.cpp
index acc7843..ec94db4 100644
--- a/llvm/tools/llvm-size/llvm-size.cpp
+++ b/llvm/tools/llvm-size/llvm-size.cpp
@@ -79,6 +79,7 @@ static bool DarwinLongFormat;
static RadixTy Radix = RadixTy::decimal;
static bool TotalSizes;
static bool HasMachOFiles = false;
+static bool ExcludePageZero = false;
static std::vector<std::string> InputFilenames;
@@ -313,7 +314,7 @@ static void printDarwinSegmentSizes(MachOObjectFile *MachO) {
total_data += Seg.vmsize;
else if (SegmentName == "__OBJC")
total_objc += Seg.vmsize;
- else
+ else if (!ExcludePageZero || SegmentName != "__PAGEZERO")
total_others += Seg.vmsize;
}
} else if (Load.C.cmd == MachO::LC_SEGMENT) {
@@ -339,7 +340,7 @@ static void printDarwinSegmentSizes(MachOObjectFile *MachO) {
total_data += Seg.vmsize;
else if (SegmentName == "__OBJC")
total_objc += Seg.vmsize;
- else
+ else if (!ExcludePageZero || SegmentName != "__PAGEZERO")
total_others += Seg.vmsize;
}
}
@@ -914,6 +915,7 @@ int llvm_size_main(int argc, char **argv, const llvm::ToolContext &) {
ELFCommons = Args.hasArg(OPT_common);
DarwinLongFormat = Args.hasArg(OPT_l);
+ ExcludePageZero = Args.hasArg(OPT_exclude_pagezero);
TotalSizes = Args.hasArg(OPT_totals);
StringRef V = Args.getLastArgValue(OPT_format_EQ, "berkeley");
if (V == "berkeley")
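__PAGEZERO is a reserved, zero-filled mapping (4096 bytes in the test above, typically 4 GiB in real 64-bit Mach-O executables), so counting its vmsize under "others" swamps the useful totals. The guard above only skips it when the user passes the new flag, as in "llvm-size --exclude-pagezero a.out", leaving default output unchanged.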
diff --git a/llvm/unittests/ADT/EquivalenceClassesTest.cpp b/llvm/unittests/ADT/EquivalenceClassesTest.cpp
index 3d5c48e..8172ff9 100644
--- a/llvm/unittests/ADT/EquivalenceClassesTest.cpp
+++ b/llvm/unittests/ADT/EquivalenceClassesTest.cpp
@@ -108,6 +108,29 @@ TEST(EquivalenceClassesTest, SimpleErase4) {
EXPECT_FALSE(EqClasses.erase(1));
}
+TEST(EquivalenceClassesTest, EraseKeepsLeaderBit) {
+ EquivalenceClasses<int> EC;
+
+ // Create a set {1, 2} where 1 is the leader.
+ EC.unionSets(1, 2);
+
+ // Verify initial state.
+ EXPECT_EQ(EC.getLeaderValue(2), 1);
+
+ // Erase 2, the non-leader member.
+ EXPECT_TRUE(EC.erase(2));
+
+ // Verify that we have exactly one equivalence class.
+ ASSERT_NE(EC.begin(), EC.end());
+ ASSERT_EQ(std::next(EC.begin()), EC.end());
+
+ // Verify that 1 is still a leader after erasing 2.
+ const auto *Elem = *EC.begin();
+ ASSERT_NE(Elem, nullptr);
+ EXPECT_EQ(Elem->getData(), 1);
+ EXPECT_TRUE(Elem->isLeader()) << "The leader bit was lost!";
+}
+
TEST(EquivalenceClassesTest, TwoSets) {
EquivalenceClasses<int> EqClasses;
// Form sets of odd and even numbers, check that we split them into these
diff --git a/llvm/unittests/ADT/PackedVectorTest.cpp b/llvm/unittests/ADT/PackedVectorTest.cpp
index 30fc7c0..df2cbf0 100644
--- a/llvm/unittests/ADT/PackedVectorTest.cpp
+++ b/llvm/unittests/ADT/PackedVectorTest.cpp
@@ -71,6 +71,14 @@ TEST(PackedVectorTest, RawBitsSize) {
EXPECT_EQ(12u, Vec.raw_bits().size());
}
+TEST(PackedVectorTest, SignedValueOverwrite) {
+ PackedVector<signed, 4> Vec(1);
+ Vec[0] = -1;
+ EXPECT_EQ(-1, Vec[0]);
+ Vec[0] = 1;
+ EXPECT_EQ(1, Vec[0]);
+}
+
#ifdef EXPECT_DEBUG_DEATH
TEST(PackedVectorTest, UnsignedValues) {
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index ab709e3..0f8fcb9 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,7 +1,3 @@
-if (LLVM_ENABLE_ONDISK_CAS)
- add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1)
-endif()
-
set(LLVM_LINK_COMPONENTS
Support
CAS
@@ -12,6 +8,7 @@ add_llvm_unittest(CASTests
ActionCacheTest.cpp
CASTestConfig.cpp
ObjectStoreTest.cpp
+ OnDiskTrieRawHashMapTest.cpp
ProgramTest.cpp
)
diff --git a/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp b/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp
new file mode 100644
index 0000000..7bedfe4
--- /dev/null
+++ b/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp
@@ -0,0 +1,220 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/OnDiskTrieRawHashMap.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Testing/Support/Error.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+#if LLVM_ENABLE_ONDISK_CAS
+using namespace llvm;
+using namespace llvm::cas;
+
+namespace {
+
+struct OnDiskTrieRawHashMapTestFixture
+ : public ::testing::TestWithParam<size_t> {
+ static constexpr size_t MB = 1024u * 1024u;
+ static constexpr size_t DataSize = 8; // Multiple of 8B.
+
+ std::optional<unittest::TempDir> Temp;
+ size_t NumHashBytes;
+
+ void SetUp() override {
+ Temp.emplace("trie-raw-hash-map", /*Unique=*/true);
+ NumHashBytes = GetParam();
+ }
+ void TearDown() override { Temp.reset(); }
+
+ Expected<OnDiskTrieRawHashMap> createTrie() {
+ size_t NumHashBits = NumHashBytes * 8;
+ return OnDiskTrieRawHashMap::create(
+ Temp->path((Twine(NumHashBytes) + "B").str()), "index",
+ /*NumHashBits=*/NumHashBits, DataSize, /*MaxFileSize=*/MB,
+ /*NewInitialFileSize=*/std::nullopt);
+ }
+};
+
+// Create tries with various sizes of hash and with data.
+TEST_P(OnDiskTrieRawHashMapTestFixture, General) {
+ std::optional<OnDiskTrieRawHashMap> Trie1;
+ ASSERT_THAT_ERROR(createTrie().moveInto(Trie1), Succeeded());
+ std::optional<OnDiskTrieRawHashMap> Trie2;
+ ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded());
+
+ uint8_t Hash0Bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+ uint8_t Hash1Bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0};
+ auto Hash0 = ArrayRef(Hash0Bytes).take_front(NumHashBytes);
+ auto Hash1 = ArrayRef(Hash1Bytes).take_front(NumHashBytes);
+ constexpr StringLiteral Data0v1Bytes = "data0.v1";
+ constexpr StringLiteral Data0v2Bytes = "data0.v2";
+ constexpr StringLiteral Data1Bytes = "data1...";
+ static_assert(Data0v1Bytes.size() == DataSize, "math error");
+ static_assert(Data0v2Bytes.size() == DataSize, "math error");
+ static_assert(Data1Bytes.size() == DataSize, "math error");
+ ArrayRef<char> Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size());
+ ArrayRef<char> Data0v2 = ArrayRef(Data0v2Bytes.data(), Data0v2Bytes.size());
+ ArrayRef<char> Data1 = ArrayRef(Data1Bytes.data(), Data1Bytes.size());
+
+ // Lookup when trie is empty.
+ EXPECT_FALSE(Trie1->find(Hash0));
+
+ // Insert.
+ std::optional<FileOffset> Offset;
+ std::optional<MutableArrayRef<char>> Data;
+ {
+ std::optional<OnDiskTrieRawHashMap::pointer> Insertion;
+ ASSERT_THAT_ERROR(Trie1->insert({Hash0, Data0v1}).moveInto(Insertion),
+ Succeeded());
+ EXPECT_EQ(Hash0, (*Insertion)->Hash);
+ EXPECT_EQ(Data0v1, (*Insertion)->Data);
+ EXPECT_TRUE(isAddrAligned(Align(8), (*Insertion)->Data.data()));
+
+ Offset = Insertion->getOffset();
+ Data = (*Insertion)->Data;
+ }
+
+ // Find.
+ {
+ auto Lookup = Trie1->find(Hash0);
+ ASSERT_TRUE(Lookup);
+ EXPECT_EQ(Hash0, Lookup->Hash);
+ EXPECT_EQ(Data0v1, Lookup->Data);
+ EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+ }
+
+ // Find in a different instance of the same on-disk trie that existed
+ // before the insertion.
+ {
+ auto Lookup = Trie2->find(Hash0);
+ ASSERT_TRUE(Lookup);
+ EXPECT_EQ(Hash0, Lookup->Hash);
+ EXPECT_EQ(Data0v1, Lookup->Data);
+ EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+ }
+
+ // Create a new instance and check that too.
+ Trie2.reset();
+ ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded());
+ {
+ auto Lookup = Trie2->find(Hash0);
+ ASSERT_TRUE(Lookup);
+ EXPECT_EQ(Hash0, Lookup->Hash);
+ EXPECT_EQ(Data0v1, Lookup->Data);
+ EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+ }
+
+ // Change the data.
+ llvm::copy(Data0v2, Data->data());
+ {
+ auto Lookup = Trie2->find(Hash0);
+ ASSERT_TRUE(Lookup);
+ EXPECT_EQ(Hash0, Lookup->Hash);
+ EXPECT_EQ(Data0v2, Lookup->Data);
+ EXPECT_EQ(Offset->get(), Lookup.getOffset().get());
+ }
+
+ // Find different hash.
+ EXPECT_FALSE(Trie1->find(Hash1));
+ EXPECT_FALSE(Trie2->find(Hash1));
+
+ // Recover from an offset.
+ {
+ OnDiskTrieRawHashMap::const_pointer Recovered;
+ ASSERT_THAT_ERROR(Trie1->recoverFromFileOffset(*Offset).moveInto(Recovered),
+ Succeeded());
+ ASSERT_TRUE(Recovered);
+ EXPECT_EQ(Offset->get(), Recovered.getOffset().get());
+ EXPECT_EQ(Hash0, Recovered->Hash);
+ EXPECT_EQ(Data0v2, Recovered->Data);
+ }
+
+ // Recover from a bad offset.
+ {
+ FileOffset BadOffset(1);
+ OnDiskTrieRawHashMap::const_pointer Recovered;
+ ASSERT_THAT_ERROR(
+ Trie1->recoverFromFileOffset(BadOffset).moveInto(Recovered), Failed());
+ }
+
+ // Insert another thing.
+ {
+ std::optional<OnDiskTrieRawHashMap::pointer> Insertion;
+ ASSERT_THAT_ERROR(Trie1->insert({Hash1, Data1}).moveInto(Insertion),
+ Succeeded());
+ EXPECT_EQ(Hash1, (*Insertion)->Hash);
+ EXPECT_EQ(Data1, (*Insertion)->Data);
+ EXPECT_TRUE(isAddrAligned(Align(8), (*Insertion)->Data.data()));
+
+ EXPECT_NE(Offset->get(), Insertion->getOffset().get());
+ }
+
+ // Validate.
+ {
+ auto RecordVerify =
+ [&](FileOffset Offset,
+ OnDiskTrieRawHashMap::ConstValueProxy Proxy) -> Error {
+ if (Proxy.Hash.size() != NumHashBytes)
+ return createStringError("wrong hash size");
+ if (Proxy.Data.size() != DataSize)
+ return createStringError("wrong data size");
+
+ return Error::success();
+ };
+ ASSERT_THAT_ERROR(Trie1->validate(RecordVerify), Succeeded());
+ ASSERT_THAT_ERROR(Trie2->validate(RecordVerify), Succeeded());
+ }
+
+ // Size and capacity.
+ {
+ EXPECT_EQ(Trie1->capacity(), MB);
+ EXPECT_EQ(Trie2->capacity(), MB);
+ EXPECT_LE(Trie1->size(), MB);
+ EXPECT_LE(Trie2->size(), MB);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(OnDiskTrieRawHashMapTest,
+ OnDiskTrieRawHashMapTestFixture,
+ ::testing::Values(1, 2, 4, 8));
+
+TEST(OnDiskTrieRawHashMapTest, OutOfSpace) {
+ unittest::TempDir Temp("trie-raw-hash-map", /*Unique=*/true);
+ std::optional<OnDiskTrieRawHashMap> Trie;
+
+ // Too small to create header.
+ ASSERT_THAT_ERROR(OnDiskTrieRawHashMap::create(
+ Temp.path("NoSpace1").str(), "index",
+ /*NumHashBits=*/8, /*DataSize=*/8, /*MaxFileSize=*/8,
+ /*NewInitialFileSize=*/std::nullopt)
+ .moveInto(Trie),
+ Failed());
+
+ // Just enough for root node but not enough for any insertion.
+ ASSERT_THAT_ERROR(OnDiskTrieRawHashMap::create(
+ Temp.path("NoSpace2").str(), "index",
+ /*NumHashBits=*/8, /*DataSize=*/8, /*MaxFileSize=*/118,
+ /*NewInitialFileSize=*/std::nullopt,
+ /*NewTableNumRootBits=*/1, /*NewTableNumSubtrieBits=*/1)
+ .moveInto(Trie),
+ Succeeded());
+ uint8_t Hash0Bytes[1] = {0};
+ auto Hash0 = ArrayRef(Hash0Bytes);
+ constexpr StringLiteral Data0v1Bytes = "data0.v1";
+ ArrayRef<char> Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size());
+ std::optional<OnDiskTrieRawHashMap::pointer> Insertion;
+ ASSERT_THAT_ERROR(Trie->insert({Hash0, Data0v1}).moveInto(Insertion),
+ Failed());
+}
+
+} // namespace
+
+#endif // LLVM_ENABLE_ONDISK_CAS
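Condensed from the General test above, the new map's basic lifecycle is create -> insert -> find, with both the hash width and the fixed record size chosen at creation time. A sketch under the same parameters the fixture uses (the path is made up; error handling follows the Expected pattern from the test):

    #include "llvm/CAS/OnDiskTrieRawHashMap.h"
    #include <cassert>

    using namespace llvm;
    using namespace llvm::cas;

    static Error demo() {
      // 64-bit hashes, 8-byte records, 1 MiB file cap; path is hypothetical.
      Expected<OnDiskTrieRawHashMap> TrieOrErr = OnDiskTrieRawHashMap::create(
          "/tmp/trie/8B", "index", /*NumHashBits=*/64, /*DataSize=*/8,
          /*MaxFileSize=*/1024 * 1024, /*NewInitialFileSize=*/std::nullopt);
      if (!TrieOrErr)
        return TrieOrErr.takeError();

      uint8_t HashBytes[8] = {1, 0, 0, 0, 0, 0, 0, 0};
      ArrayRef<char> Data("data1...", 8);
      Expected<OnDiskTrieRawHashMap::pointer> P =
          TrieOrErr->insert({ArrayRef(HashBytes), Data});
      if (!P)
        return P.takeError();
      // (*P)->Data is an 8-byte, 8-aligned region backed by the mapped file;
      // the record is also visible to other instances opened on the same path.
      assert(TrieOrErr->find(ArrayRef(HashBytes)));
      return Error::success();
    }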
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index 53bc024..3a625b2 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -159,7 +159,7 @@ public:
   // Set up things like the artificial block map, and BlockNo <=> RPO Order
// mappings.
LDV->initialSetup(*MF);
- LDV->LS.initialize(*MF);
+ LDV->LS.scanFunction(*MF);
addMTracker(MF);
return &*LDV;
}
diff --git a/llvm/unittests/CodeGen/LexicalScopesTest.cpp b/llvm/unittests/CodeGen/LexicalScopesTest.cpp
index 563d496..34bd37a 100644
--- a/llvm/unittests/CodeGen/LexicalScopesTest.cpp
+++ b/llvm/unittests/CodeGen/LexicalScopesTest.cpp
@@ -44,6 +44,7 @@ public:
std::unique_ptr<MachineFunction> MF;
DICompileUnit *OurCU;
DIFile *OurFile;
+ DISubroutineType *OurSubT;
DISubprogram *OurFunc;
DILexicalBlock *OurBlock, *AnotherBlock;
DISubprogram *ToInlineFunc;
@@ -103,7 +104,7 @@ public:
OurFile = DIB.createFile("xyzzy.c", "/cave");
OurCU =
DIB.createCompileUnit(dwarf::DW_LANG_C99, OurFile, "nou", false, "", 0);
- auto OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
+ OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
OurFunc =
DIB.createFunction(OurCU, "bees", "", OurFile, 1, OurSubT, 1,
DINode::FlagZero, DISubprogram::SPFlagDefinition);
@@ -136,10 +137,10 @@ TEST_F(LexicalScopesTest, FlatLayout) {
LexicalScopes LS;
EXPECT_TRUE(LS.empty());
- LS.reset();
+ LS.resetFunction();
EXPECT_EQ(LS.getCurrentFunctionScope(), nullptr);
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
EXPECT_FALSE(LS.empty());
LexicalScope *FuncScope = LS.getCurrentFunctionScope();
EXPECT_EQ(FuncScope->getParent(), nullptr);
@@ -182,7 +183,7 @@ TEST_F(LexicalScopesTest, BlockScopes) {
BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst);
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
LexicalScope *FuncScope = LS.getCurrentFunctionScope();
EXPECT_EQ(FuncScope->getDesc(), OurFunc);
auto &Children = FuncScope->getChildren();
@@ -217,7 +218,7 @@ TEST_F(LexicalScopesTest, InlinedScopes) {
BuildMI(*MBB4, MBB4->end(), InlinedLoc, BeanInst);
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
LexicalScope *FuncScope = LS.getCurrentFunctionScope();
auto &Children = FuncScope->getChildren();
ASSERT_EQ(Children.size(), 1u);
@@ -252,7 +253,7 @@ TEST_F(LexicalScopesTest, FuncWithEmptyGap) {
BuildMI(*MBB4, MBB4->end(), OutermostLoc, BeanInst);
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
LexicalScope *FuncScope = LS.getCurrentFunctionScope();
   // A gap in a range that contains no other location is not actually a
@@ -273,7 +274,7 @@ TEST_F(LexicalScopesTest, FuncWithRealGap) {
MachineInstr *LastI = BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst);
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get());
ASSERT_NE(BlockScope, nullptr);
@@ -306,7 +307,7 @@ TEST_F(LexicalScopesTest, NotNested) {
MachineInstr *FourthI = BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst);
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
LexicalScope *FuncScope = LS.getCurrentFunctionScope();
LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get());
LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get());
@@ -344,7 +345,7 @@ TEST_F(LexicalScopesTest, TestDominates) {
BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst);
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
LexicalScope *FuncScope = LS.getCurrentFunctionScope();
LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get());
LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get());
@@ -386,7 +387,7 @@ TEST_F(LexicalScopesTest, TestGetBlocks) {
BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst);
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
LexicalScope *FuncScope = LS.getCurrentFunctionScope();
LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get());
LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get());
@@ -443,7 +444,7 @@ TEST_F(LexicalScopesTest, TestMetaInst) {
BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst);
LexicalScopes LS;
- LS.initialize(*MF);
+ LS.scanFunction(*MF);
LexicalScope *FuncScope = LS.getCurrentFunctionScope();
LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get());
ASSERT_NE(FuncScope, nullptr);
@@ -459,4 +460,24 @@ TEST_F(LexicalScopesTest, TestMetaInst) {
EXPECT_TRUE(LS.dominates(InBlockLoc.get(), MBB4));
}
+// Test function map creation.
+TEST_F(LexicalScopesTest, TestFunctionScan) {
+ auto MF2 = createMachineFunction(Ctx, Mod, "Test2");
+ DIBuilder DIB(Mod, false, OurCU);
+ DISubprogram *Func2 =
+ DIB.createFunction(OurCU, "Func2", "", OurFile, 1, OurSubT, 1,
+ DINode::FlagZero, DISubprogram::SPFlagDefinition);
+ DISubprogram *UnattachedFunc =
+ DIB.createFunction(OurCU, "UnattachedFunc", "", OurFile, 1, OurSubT, 1,
+ DINode::FlagZero, DISubprogram::SPFlagDefinition);
+ MF2->getFunction().setSubprogram(Func2);
+ DIB.finalize();
+
+ LexicalScopes LS;
+ LS.initialize(Mod);
+ ASSERT_EQ(LS.getFunction(OurFunc), &MF->getFunction());
+ ASSERT_EQ(LS.getFunction(Func2), &MF2->getFunction());
+ ASSERT_EQ(LS.getFunction(UnattachedFunc), nullptr);
+}
+
} // anonymous namespace
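Read together, these test updates track a split in the LexicalScopes interface: the old per-function initialize(MF)/reset() pair becomes scanFunction(MF)/resetFunction(), while initialize(Module) now builds a module-wide DISubprogram-to-Function map queried through getFunction() (null for subprograms not attached to any function, as TestFunctionScan checks). A sketch inferred purely from the call sites above, with Mod, MF, and SP standing in for a real Module, MachineFunction, and DISubprogram:

    // Inferred usage of the renamed interface; not taken from real pass code.
    void example(Module &Mod, MachineFunction &MF, DISubprogram *SP) {
      LexicalScopes LS;
      LS.initialize(Mod);                     // Module level: index subprograms.
      const Function *F = LS.getFunction(SP); // Null if SP is unattached.
      LS.scanFunction(MF);                    // Per function: build scope tree.
      LexicalScope *Top = LS.getCurrentFunctionScope();
      LS.resetFunction();                     // Inferred: clears only the
                                              // per-function state.
      (void)F;
      (void)Top;
    }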
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index cb4a241..a86a68c 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -132,10 +132,10 @@ BogusTargetMachine *createTargetMachine() {
return &BogusTM;
}
-std::unique_ptr<MachineFunction> createMachineFunction(LLVMContext &Ctx,
- Module &M) {
+std::unique_ptr<MachineFunction>
+createMachineFunction(LLVMContext &Ctx, Module &M, const Twine &Name = "Test") {
auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
- auto F = Function::Create(Type, GlobalValue::ExternalLinkage, "Test", &M);
+ auto F = Function::Create(Type, GlobalValue::ExternalLinkage, Name, &M);
auto TM = createTargetMachine();
unsigned FunctionNum = 42;
@@ -145,4 +145,3 @@ std::unique_ptr<MachineFunction> createMachineFunction(LLVMContext &Ctx,
return std::make_unique<MachineFunction>(*F, *TM, STI, MMI.getContext(),
FunctionNum);
}
-
diff --git a/llvm/unittests/CodeGen/TypeTraitsTest.cpp b/llvm/unittests/CodeGen/TypeTraitsTest.cpp
index dde8628..f0ed0e8 100644
--- a/llvm/unittests/CodeGen/TypeTraitsTest.cpp
+++ b/llvm/unittests/CodeGen/TypeTraitsTest.cpp
@@ -6,13 +6,16 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "gtest/gtest.h"
+#include <functional>
#include <type_traits>
+#include <utility>
using namespace llvm;
@@ -23,3 +26,35 @@ static_assert(std::is_trivially_copyable_v<SDValue>, "trivially copyable");
static_assert(std::is_trivially_copyable_v<SlotIndex>, "trivially copyable");
static_assert(std::is_trivially_copyable_v<IdentifyingPassPtr>,
"trivially copyable");
+
+// https://llvm.org/PR105169
+// Verify that we won't accidentally specialize std::less and std::equal_to in
+// the wrong way.
+// C++17 [namespace.std]/2, C++20/23 [namespace.std]/5:
+// A program may explicitly instantiate a template defined in the standard
+// library only if the declaration
+// - depends on the name of a user-defined type and
+// - the instantiation meets the standard library requirements for the
+// original template.
+template <class Fn> constexpr bool CheckStdCmpRequirements() {
+ // std::less and std::equal_to are literal, default constructible, and
+ // copyable classes.
+ Fn f1{};
+ auto f2 = f1;
+ auto f3 = std::move(f2);
+ f2 = f3;
+ f2 = std::move(f3);
+
+  // These properties hold on all known implementations, although they are not
+  // guaranteed by the standard.
+ static_assert(std::is_empty_v<Fn>);
+ static_assert(std::is_trivially_default_constructible_v<Fn>);
+ static_assert(std::is_trivially_copyable_v<Fn>);
+
+ return true;
+}
+
+static_assert(CheckStdCmpRequirements<std::less<rdf::RegisterRef>>(),
+ "same as the original template");
+static_assert(CheckStdCmpRequirements<std::equal_to<rdf::RegisterRef>>(),
+ "same as the original template");
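For reference, a minimal standalone sketch of what the rule quoted in the comment above asks for — here a hypothetical `Point` type, not part of the patch, standing in for a program-defined type with ordinary comparison operators:

```c++
#include <functional>

// Hypothetical program-defined type; the primary std::less / std::equal_to
// templates only require the usual comparison operators.
struct Point {
  int X;
  constexpr bool operator<(const Point &O) const { return X < O.X; }
  constexpr bool operator==(const Point &O) const { return X == O.X; }
};

// The primary templates are literal and default constructible, so the checks
// run at compile time (std::less::operator() is constexpr since C++14).
static_assert(std::less<Point>{}(Point{1}, Point{2}), "uses operator<");
static_assert(std::equal_to<Point>{}(Point{3}, Point{3}), "uses operator==");
```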
diff --git a/llvm/unittests/MC/StringTableBuilderTest.cpp b/llvm/unittests/MC/StringTableBuilderTest.cpp
index 05f469a2..44a985b 100644
--- a/llvm/unittests/MC/StringTableBuilderTest.cpp
+++ b/llvm/unittests/MC/StringTableBuilderTest.cpp
@@ -58,8 +58,8 @@ TEST(StringTableBuilderTest, BasicWinCOFF) {
std::string Expected;
- ExpectedSize = support::endian::byte_swap<uint32_t, llvm::endianness::little>(
- ExpectedSize);
+ ExpectedSize = support::endian::byte_swap<uint32_t>(ExpectedSize,
+ llvm::endianness::little);
Expected.append((const char*)&ExpectedSize, 4);
Expected += "pygmy hippopotamus";
Expected += '\x00';
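For context, a minimal sketch of the new `byte_swap` call shape used in this hunk — the endianness moves from a second template parameter to an ordinary runtime argument (the value here is illustrative):

```c++
#include "llvm/Support/Endian.h"

uint32_t Size = 22;
// Old shape: byte_swap<uint32_t, llvm::endianness::little>(Size)
// New shape: endianness is passed as a regular argument.
Size = llvm::support::endian::byte_swap<uint32_t>(Size, llvm::endianness::little);
```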
diff --git a/llvm/unittests/Support/MustacheTest.cpp b/llvm/unittests/Support/MustacheTest.cpp
index 02eaed4..0ebbc58 100644
--- a/llvm/unittests/Support/MustacheTest.cpp
+++ b/llvm/unittests/Support/MustacheTest.cpp
@@ -998,7 +998,7 @@ TEST(MustachePartials, StandaloneIndentation) {
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
- EXPECT_NE("\\\n |\n <\n ->\n |\n/\n", Out);
+ EXPECT_EQ("\\\n |\n <\n ->\n |\n/\n", Out);
}
TEST(MustacheLambdas, BasicInterpolation) {
@@ -1328,3 +1328,139 @@ TEST(MustacheTripleMustache, WithPadding) {
T.render(D, OS);
EXPECT_EQ("|---|", Out);
}
+
+TEST(MustacheDelimiters, PairBehavior) {
+ Value D = Object{{"text", "Hey!"}};
+ auto T = Template("{{=<% %>=}}(<%text%>)");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("(Hey!)", Out);
+}
+
+TEST(MustacheDelimiters, SpecialCharacters) {
+ Value D = Object{{"text", "It worked!"}};
+ auto T = Template("({{=[ ]=}}[text])");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("(It worked!)", Out);
+}
+
+TEST(MustacheDelimiters, Sections) {
+ Value D = Object{{"section", true}, {"data", "I got interpolated."}};
+ auto T =
+ Template("[\n{{#section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= "
+ "| | =}}\n|#section|\n {{data}}\n |data|\n|/section|\n]\n");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("[\n I got interpolated.\n |data|\n\n {{data}}\n I got "
+ "interpolated.\n]\n",
+ Out);
+}
+
+TEST(MustacheDelimiters, InvertedSections) {
+ Value D = Object{{"section", false}, {"data", "I got interpolated."}};
+ auto T =
+ Template("[\n{{^section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= "
+ "| | =}}\n|^section|\n {{data}}\n |data|\n|/section|\n]\n");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("[\n I got interpolated.\n |data|\n\n {{data}}\n I got "
+ "interpolated.\n]\n",
+ Out);
+}
+
+TEST(MustacheDelimiters, PartialInheritence) {
+ Value D = Object{{"value", "yes"}};
+ auto T = Template("[ {{>include}} ]\n{{= | | =}}\n[ |>include| ]\n");
+ T.registerPartial("include", ".{{value}}.");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("[ .yes. ]\n[ .yes. ]\n", Out);
+}
+
+TEST(MustacheDelimiters, PostPartialBehavior) {
+ Value D = Object{{"value", "yes"}};
+ auto T = Template("[ {{>include}} ]\n[ .{{value}}. .|value|. ]\n");
+ T.registerPartial("include", ".{{value}}. {{= | | =}} .|value|.");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("[ .yes. .yes. ]\n[ .yes. .|value|. ]\n", Out);
+}
+
+TEST(MustacheDelimiters, SurroundingWhitespace) {
+ Value D = Object{};
+ auto T = Template("| {{=@ @=}} |");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_EQ("| |", Out);
+}
+
+TEST(MustacheDelimiters, OutlyingWhitespaceInline) {
+ Value D = Object{};
+ auto T = Template(" | {{=@ @=}}\n");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_EQ(" | \n", Out);
+}
+
+TEST(MustacheDelimiters, StandaloneTag) {
+ Value D = Object{};
+ auto T = Template("Begin.\n{{=@ @=}}\nEnd.\n");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("Begin.\nEnd.\n", Out);
+}
+
+TEST(MustacheDelimiters, IndentedStandaloneTag) {
+ Value D = Object{};
+ auto T = Template("Begin.\n {{=@ @=}}\nEnd.\n");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("Begin.\nEnd.\n", Out);
+}
+
+TEST(MustacheDelimiters, StandaloneLineEndings) {
+ Value D = Object{};
+ auto T = Template("|\r\n{{= @ @ =}}\r\n|");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("|\r\n|", Out);
+}
+
+TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) {
+ Value D = Object{};
+ auto T = Template(" {{=@ @=}}\n=");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("=", Out);
+}
+
+TEST(MustacheDelimiters, StandaloneWithoutNewline) {
+ Value D = Object{};
+ auto T = Template("=\n {{=@ @=}}");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_NE("=\n", Out);
+}
+
+TEST(MustacheDelimiters, PairwithPadding) {
+ Value D = Object{};
+ auto T = Template("|{{= @ @ =}}|");
+ std::string Out;
+ raw_string_ostream OS(Out);
+ T.render(D, OS);
+ EXPECT_EQ("||", Out);
+}
diff --git a/llvm/unittests/Support/SipHashTest.cpp b/llvm/unittests/Support/SipHashTest.cpp
index 7c557eb..3037e64 100644
--- a/llvm/unittests/Support/SipHashTest.cpp
+++ b/llvm/unittests/Support/SipHashTest.cpp
@@ -50,6 +50,13 @@ TEST(SipHashTest, SipHash_2_4_128) {
}
}
+// Tests for the 64-bit stable SipHash wrapper.
+TEST(SipHashTest, StableSipHash) {
+ EXPECT_EQ(0xB2BB69BB0A2AC0F1UL, getStableSipHash(""));
+ EXPECT_EQ(0x9304ABFF427B72E8UL, getStableSipHash("strlen"));
+ EXPECT_EQ(0x55F45179A08AE51BUL, getStableSipHash("_ZN1 ind; f"));
+}
+
// Tests for the ptrauth-specific SipHash wrapper.
TEST(SipHashTest, PointerAuthSipHash) {
// Test some basic cases.
diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp
index 6228de8..d47a4ee 100644
--- a/llvm/unittests/Support/VirtualFileSystemTest.cpp
+++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp
@@ -1941,7 +1941,7 @@ TEST_F(VFSFromYAMLTest, ReturnsExternalPathVFSHit) {
EXPECT_EQ(0, NumDiagnostics);
}
-TEST_F(VFSFromYAMLTest, RootRelativeTest) {
+TEST_F(VFSFromYAMLTest, RootRelativeToOverlayDirTest) {
auto Lower = makeIntrusiveRefCnt<DummyFileSystem>();
Lower->addDirectory("//root/foo/bar");
Lower->addRegularFile("//root/foo/bar/a");
@@ -2004,6 +2004,35 @@ TEST_F(VFSFromYAMLTest, RootRelativeTest) {
#endif
}
+TEST_F(VFSFromYAMLTest, RootRelativeToCWDTest) {
+ auto Lower = makeIntrusiveRefCnt<DummyFileSystem>();
+ Lower->addDirectory("//root/foo/bar");
+ Lower->addRegularFile("//root/foo/bar/a");
+ Lower->addDirectory("//root/foo/bar/cwd");
+ Lower->addRegularFile("//root/foo/bar/cwd/a");
+ Lower->setCurrentWorkingDirectory("//root/foo/bar/cwd");
+ IntrusiveRefCntPtr<vfs::FileSystem> FS =
+ getFromYAMLString("{\n"
+ " 'case-sensitive': false,\n"
+ " 'root-relative': 'cwd',\n"
+ " 'roots': [\n"
+ " { 'name': 'b', 'type': 'file',\n"
+ " 'external-contents': '//root/foo/bar/a'\n"
+ " }\n"
+ " ]\n"
+ "}",
+ Lower, "//root/foo/bar/overlay");
+
+ ASSERT_NE(FS.get(), nullptr);
+
+ ErrorOr<vfs::Status> S1 = FS->status("//root/foo/bar/b");
+ ASSERT_TRUE(S1.getError());
+
+ ErrorOr<vfs::Status> S2 = FS->status("//root/foo/bar/cwd/b");
+ ASSERT_FALSE(S2.getError());
+ EXPECT_EQ("//root/foo/bar/a", S2->getName());
+}
+
TEST_F(VFSFromYAMLTest, ReturnsInternalPathVFSHit) {
auto BaseFS = makeIntrusiveRefCnt<vfs::InMemoryFileSystem>();
BaseFS->addFile("//root/foo/realname", 0,
@@ -2489,6 +2518,7 @@ TEST_F(VFSFromYAMLTest, RelativePaths) {
SmallString<128> CWD;
EC = llvm::sys::fs::current_path(CWD);
ASSERT_FALSE(EC);
+ Lower->setCurrentWorkingDirectory(CWD);
// Filename at root level without a parent directory.
IntrusiveRefCntPtr<vfs::FileSystem> FS = getFromYAMLString(
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 98c6d84..08c8944 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -107,6 +107,7 @@ Instrumentation/AddressSanitizer/AMDGPU/global_metadata_addrspacecasts.ll
Instrumentation/AddressSanitizer/AMDGPU/instrument-stack.ll
Instrumentation/AddressSanitizer/AMDGPU/no_redzones_in_lds_globals.ll
Instrumentation/AddressSanitizer/AMDGPU/no_redzones_in_scratch_globals.ll
+Instrumentation/AddressSanitizer/RISCV/asan-rvv-intrinsics.ll
Instrumentation/AddressSanitizer/asan_address_space_attr.ll
Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll
Instrumentation/AddressSanitizer/asan-disable-sanitizer-instrumentation.ll
@@ -837,8 +838,6 @@ Transforms/InstCombine/2011-02-14-InfLoop.ll
Transforms/InstCombine/AArch64/sve-intrinsic-sel.ll
Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
Transforms/InstCombine/AArch64/sve-intrinsic-simplify-shift.ll
-Transforms/InstCombine/add2.ll
-Transforms/InstCombine/add.ll
Transforms/InstCombine/add-mask.ll
Transforms/InstCombine/add-shl-mul-umax.ll
Transforms/InstCombine/add-shl-sdiv-to-srem.ll
@@ -1097,6 +1096,7 @@ Transforms/IROutliner/outlining-remapped-outputs.ll
Transforms/IROutliner/outlining-same-constants.ll
Transforms/IROutliner/outlining-same-globals.ll
Transforms/IROutliner/outlining-same-output-blocks.ll
+Transforms/IROutliner/outlining-special-state.ll
Transforms/IROutliner/outlining-strip-loop-info.ll
Transforms/IROutliner/outlining-swift-error.ll
Transforms/IROutliner/phi-node-exit-path-order.ll
@@ -1159,449 +1159,6 @@ Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll
Transforms/LoopUnroll/peel-last-iteration-expansion-cost.ll
Transforms/LoopUnroll/peel-last-iteration-with-guards.ll
Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll
-Transforms/LoopVectorize/12-12-11-if-conv.ll
-Transforms/LoopVectorize/AArch64/aarch64-predication.ll
-Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll
-Transforms/LoopVectorize/AArch64/blend-costs.ll
-Transforms/LoopVectorize/AArch64/check-prof-info.ll
-Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
-Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
-Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
-Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
-Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll
-Transforms/LoopVectorize/AArch64/early_exit_costs.ll
-Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
-Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
-Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
-Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
-Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
-Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll
-Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
-Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll
-Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
-Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
-Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
-Transforms/LoopVectorize/AArch64/induction-costs.ll
-Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
-Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
-Transforms/LoopVectorize/AArch64/interleaved_cost.ll
-Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll
-Transforms/LoopVectorize/AArch64/interleave-with-runtime-checks.ll
-Transforms/LoopVectorize/AArch64/interleaving-load-store.ll
-Transforms/LoopVectorize/AArch64/interleaving-reduction.ll
-Transforms/LoopVectorize/AArch64/intrinsiccost.ll
-Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll
-Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
-Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll
-Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
-Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
-Transforms/LoopVectorize/AArch64/masked-call.ll
-Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll
-Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
-Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll
-Transforms/LoopVectorize/AArch64/optsize_minsize.ll
-Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
-Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
-Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
-Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
-Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
-Transforms/LoopVectorize/AArch64/partial-reduce.ll
-Transforms/LoopVectorize/AArch64/pr31900.ll
-Transforms/LoopVectorize/AArch64/pr33053.ll
-Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll
-Transforms/LoopVectorize/AArch64/predicated-costs.ll
-Transforms/LoopVectorize/AArch64/predication_costs.ll
-Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
-Transforms/LoopVectorize/AArch64/reduction-small-size.ll
-Transforms/LoopVectorize/AArch64/reg-usage.ll
-Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
-Transforms/LoopVectorize/AArch64/runtime-check-trip-count-decisions.ll
-Transforms/LoopVectorize/AArch64/scalable-call.ll
-Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll
-Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
-Transforms/LoopVectorize/AArch64/scalable-reductions.ll
-Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll
-Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
-Transforms/LoopVectorize/AArch64/scalable-struct-return.ll
-Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
-Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
-Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
-Transforms/LoopVectorize/AArch64/sdiv-pow2.ll
-Transforms/LoopVectorize/AArch64/select-costs.ll
-Transforms/LoopVectorize/AArch64/simple_early_exit.ll
-Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll
-Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
-Transforms/LoopVectorize/AArch64/store-costs-sve.ll
-Transforms/LoopVectorize/AArch64/strict-fadd.ll
-Transforms/LoopVectorize/AArch64/struct-return-cost.ll
-Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll
-Transforms/LoopVectorize/AArch64/sve-basic-vec.ll
-Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
-Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
-Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
-Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
-Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
-Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll
-Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll
-Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
-Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
-Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
-Transforms/LoopVectorize/AArch64/sve-large-strides.ll
-Transforms/LoopVectorize/AArch64/sve-multiexit.ll
-Transforms/LoopVectorize/AArch64/sve-select-cmp.ll
-Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll
-Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
-Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
-Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
-Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll
-Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
-Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
-Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
-Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll
-Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
-Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
-Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
-Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
-Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
-Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
-Transforms/LoopVectorize/AArch64/vplan-printing.ll
-Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
-Transforms/LoopVectorize/ARM/mve-icmpcost.ll
-Transforms/LoopVectorize/ARM/mve-multiexit.ll
-Transforms/LoopVectorize/ARM/mve-qabs.ll
-Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
-Transforms/LoopVectorize/ARM/mve-reductions.ll
-Transforms/LoopVectorize/ARM/mve-reduction-types.ll
-Transforms/LoopVectorize/ARM/mve-selectandorcost.ll
-Transforms/LoopVectorize/ARM/optsize_minsize.ll
-Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
-Transforms/LoopVectorize/ARM/scalar-block-cost.ll
-Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
-Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
-Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
-Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
-Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
-Transforms/LoopVectorize/ARM/tail-folding-reductions-allowed.ll
-Transforms/LoopVectorize/as_cast.ll
-Transforms/LoopVectorize/assume.ll
-Transforms/LoopVectorize/bzip_reverse_loops.ll
-Transforms/LoopVectorize/calloc.ll
-Transforms/LoopVectorize/cast-induction.ll
-Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
-Transforms/LoopVectorize/dbg-outer-loop-vect.ll
-Transforms/LoopVectorize/debugloc.ll
-Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll
-Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size-needs-loop-guards.ll
-Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll
-Transforms/LoopVectorize/diag-with-hotness-info.ll
-Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
-Transforms/LoopVectorize/early_exit_legality.ll
-Transforms/LoopVectorize/epilog-iv-select-cmp.ll
-Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
-Transforms/LoopVectorize/epilog-vectorization-reductions.ll
-Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll
-Transforms/LoopVectorize/explicit_outer_detection.ll
-Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll
-Transforms/LoopVectorize/first-order-recurrence-complex.ll
-Transforms/LoopVectorize/first-order-recurrence.ll
-Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
-Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
-Transforms/LoopVectorize/float-induction.ll
-Transforms/LoopVectorize/float-minmax-instruction-flag.ll
-Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
-Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
-Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
-Transforms/LoopVectorize/forked-pointers.ll
-Transforms/LoopVectorize/gcc-examples.ll
-Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll
-Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
-Transforms/LoopVectorize/hoist-loads.ll
-Transforms/LoopVectorize/i8-induction.ll
-Transforms/LoopVectorize/icmp-uniforms.ll
-Transforms/LoopVectorize/if-conversion.ll
-Transforms/LoopVectorize/if-conversion-nest.ll
-Transforms/LoopVectorize/if-pred-non-void.ll
-Transforms/LoopVectorize/if-pred-not-when-safe.ll
-Transforms/LoopVectorize/if-pred-stores.ll
-Transforms/LoopVectorize/if-reduction.ll
-Transforms/LoopVectorize/induction.ll
-Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
-Transforms/LoopVectorize/interleave-and-scalarize-only.ll
-Transforms/LoopVectorize/interleaved-accesses-2.ll
-Transforms/LoopVectorize/interleaved-accesses-3.ll
-Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
-Transforms/LoopVectorize/interleaved-accesses.ll
-Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
-Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
-Transforms/LoopVectorize/interleaved-accesses-requiring-scev-predicates.ll
-Transforms/LoopVectorize/interleaved-accesses-uniform-load.ll
-Transforms/LoopVectorize/invariant-store-vectorization-2.ll
-Transforms/LoopVectorize/invariant-store-vectorization.ll
-Transforms/LoopVectorize/is_fpclass.ll
-Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
-Transforms/LoopVectorize/iv-select-cmp.ll
-Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll
-Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll
-Transforms/LoopVectorize/iv-select-cmp-trunc.ll
-Transforms/LoopVectorize/lcssa-crashes.ll
-Transforms/LoopVectorize/load-deref-pred-align.ll
-Transforms/LoopVectorize/load-deref-pred-neg-off.ll
-Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
-Transforms/LoopVectorize/load-of-struct-deref-pred.ll
-Transforms/LoopVectorize/loop-form.ll
-Transforms/LoopVectorize/loop-with-constant-exit-condition.ll
-Transforms/LoopVectorize/memdep-fold-tail.ll
-Transforms/LoopVectorize/metadata.ll
-Transforms/LoopVectorize/minmax_reduction.ll
-Transforms/LoopVectorize/multiple-exits-versioning.ll
-Transforms/LoopVectorize/multiple-result-intrinsics.ll
-Transforms/LoopVectorize/noalias-scope-decl.ll
-Transforms/LoopVectorize/no_outside_user.ll
-Transforms/LoopVectorize/no_switch.ll
-Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
-Transforms/LoopVectorize/optimal-epilog-vectorization.ll
-Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
-Transforms/LoopVectorize/optsize.ll
-Transforms/LoopVectorize/outer_loop_hcfg_construction.ll
-Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll
-Transforms/LoopVectorize/outer_loop_scalable.ll
-Transforms/LoopVectorize/outer_loop_test1.ll
-Transforms/LoopVectorize/outer_loop_test2.ll
-Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll
-Transforms/LoopVectorize/outer-loop-wide-phis.ll
-Transforms/LoopVectorize/phi-cost.ll
-Transforms/LoopVectorize/pointer-induction.ll
-Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
-Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
-Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
-Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
-Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
-Transforms/LoopVectorize/PowerPC/vplan-scalarivsext-crash.ll
-Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll
-Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
-Transforms/LoopVectorize/pr32859.ll
-Transforms/LoopVectorize/pr34681.ll
-Transforms/LoopVectorize/pr37248.ll
-Transforms/LoopVectorize/pr39099.ll
-Transforms/LoopVectorize/pr44488-predication.ll
-Transforms/LoopVectorize/pr45525.ll
-Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
-Transforms/LoopVectorize/pr48832.ll
-Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll
-Transforms/LoopVectorize/pr55100-expand-scev-predicate-used.ll
-Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll
-Transforms/LoopVectorize/predicatedinst-loop-invariant.ll
-Transforms/LoopVectorize/predicate-switch.ll
-Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
-Transforms/LoopVectorize/reduction-inloop-cond.ll
-Transforms/LoopVectorize/reduction-inloop.ll
-Transforms/LoopVectorize/reduction-inloop-pred.ll
-Transforms/LoopVectorize/reduction-inloop-uf4.ll
-Transforms/LoopVectorize/reduction.ll
-Transforms/LoopVectorize/reduction-order.ll
-Transforms/LoopVectorize/reduction-predselect.ll
-Transforms/LoopVectorize/reduction-small-size.ll
-Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
-Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
-Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
-Transforms/LoopVectorize/RISCV/divrem.ll
-Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
-Transforms/LoopVectorize/RISCV/inloop-reduction.ll
-Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
-Transforms/LoopVectorize/RISCV/mask-index-type.ll
-Transforms/LoopVectorize/RISCV/pr154103.ll
-Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
-Transforms/LoopVectorize/RISCV/pr88802.ll
-Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll
-Transforms/LoopVectorize/RISCV/reductions.ll
-Transforms/LoopVectorize/RISCV/safe-dep-distance.ll
-Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
-Transforms/LoopVectorize/RISCV/strided-accesses.ll
-Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
-Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll
-Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
-Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll
-Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll
-Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
-Transforms/LoopVectorize/RISCV/uniform-load-store.ll
-Transforms/LoopVectorize/runtime-checks-difference.ll
-Transforms/LoopVectorize/same-base-access.ll
-Transforms/LoopVectorize/scalable-assume.ll
-Transforms/LoopVectorize/scalable-first-order-recurrence.ll
-Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
-Transforms/LoopVectorize/scalarized-bitcast.ll
-Transforms/LoopVectorize/scalarize-masked-call.ll
-Transforms/LoopVectorize/scalar-select.ll
-Transforms/LoopVectorize/scev-predicate-reasoning.ll
-Transforms/LoopVectorize/select-cmp.ll
-Transforms/LoopVectorize/select-cmp-multiuse.ll
-Transforms/LoopVectorize/select-cmp-predicated.ll
-Transforms/LoopVectorize/select-neg-cond.ll
-Transforms/LoopVectorize/select-reduction.ll
-Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
-Transforms/LoopVectorize/select-with-fastflags.ll
-Transforms/LoopVectorize/single-early-exit-cond-poison.ll
-Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll
-Transforms/LoopVectorize/single-early-exit-interleave-hint.ll
-Transforms/LoopVectorize/single-early-exit-interleave.ll
-Transforms/LoopVectorize/single-early-exit-interleave-only.ll
-Transforms/LoopVectorize/single_early_exit_live_outs.ll
-Transforms/LoopVectorize/single_early_exit.ll
-Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll
-Transforms/LoopVectorize/single-value-blend-phis.ll
-Transforms/LoopVectorize/skip-iterations.ll
-Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
-Transforms/LoopVectorize/strict-fadd-interleave-only.ll
-Transforms/LoopVectorize/struct-return.ll
-Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
-Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll
-Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
-Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
-Transforms/LoopVectorize/SystemZ/pr38110.ll
-Transforms/LoopVectorize/SystemZ/pr47665.ll
-Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
-Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
-Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
-Transforms/LoopVectorize/tail-folding-counting-down.ll
-Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
-Transforms/LoopVectorize/tail-folding-switch.ll
-Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
-Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll
-Transforms/LoopVectorize/tripcount.ll
-Transforms/LoopVectorize/trunc-extended-icmps.ll
-Transforms/LoopVectorize/uncountable-early-exit-vplan.ll
-Transforms/LoopVectorize/uniform-blend.ll
-Transforms/LoopVectorize/unroll_nonlatch.ll
-Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll
-Transforms/LoopVectorize/vectorize-pointer-phis.ll
-Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll
-Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
-Transforms/LoopVectorize/vect.stats.ll
-Transforms/LoopVectorize/VE/disable_lv.ll
-Transforms/LoopVectorize/version-stride-with-integer-casts.ll
-Transforms/LoopVectorize/vplan-predicate-switch.ll
-Transforms/LoopVectorize/vplan-printing.ll
-Transforms/LoopVectorize/vplan-printing-outer-loop.ll
-Transforms/LoopVectorize/vplan-printing-reductions.ll
-Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
-Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
-Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll
-Transforms/LoopVectorize/vplan-widen-call-instruction.ll
-Transforms/LoopVectorize/vplan-widen-select-instruction.ll
-Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
-Transforms/LoopVectorize/X86/avx1.ll
-Transforms/LoopVectorize/X86/avx512.ll
-Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
-Transforms/LoopVectorize/X86/constant-fold.ll
-Transforms/LoopVectorize/X86/conversion-cost.ll
-Transforms/LoopVectorize/X86/cost-conditional-branches.ll
-Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
-Transforms/LoopVectorize/X86/CostModel/handle-iptr-with-data-layout-to-not-assert.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll
-Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll
-Transforms/LoopVectorize/X86/cost-model.ll
-Transforms/LoopVectorize/X86/CostModel/masked-gather-i32-with-i8-index.ll
-Transforms/LoopVectorize/X86/CostModel/masked-gather-i64-with-i8-index.ll
-Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll
-Transforms/LoopVectorize/X86/CostModel/masked-load-i16.ll
-Transforms/LoopVectorize/X86/CostModel/masked-load-i32.ll
-Transforms/LoopVectorize/X86/CostModel/masked-load-i64.ll
-Transforms/LoopVectorize/X86/CostModel/masked-load-i8.ll
-Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll
-Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll
-Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll
-Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll
-Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll
-Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll
-Transforms/LoopVectorize/X86/divs-with-tail-folding.ll
-Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
-Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
-Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
-Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
-Transforms/LoopVectorize/X86/float-induction-x86.ll
-Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
-Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
-Transforms/LoopVectorize/X86/gather_scatter.ll
-Transforms/LoopVectorize/X86/imprecise-through-phis.ll
-Transforms/LoopVectorize/X86/induction-costs.ll
-Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll
-Transforms/LoopVectorize/X86/interleaved-accesses-waw-dependency.ll
-Transforms/LoopVectorize/X86/intrinsiccost.ll
-Transforms/LoopVectorize/X86/invariant-load-gather.ll
-Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
-Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
-Transforms/LoopVectorize/X86/load-deref-pred.ll
-Transforms/LoopVectorize/X86/masked_load_store.ll
-Transforms/LoopVectorize/X86/masked-store-cost.ll
-Transforms/LoopVectorize/X86/multi-exit-cost.ll
-Transforms/LoopVectorize/X86/no_fpmath.ll
-Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
-Transforms/LoopVectorize/X86/optsize.ll
-Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
-Transforms/LoopVectorize/X86/pr109581-unused-blend.ll
-Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll
-Transforms/LoopVectorize/X86/pr23997.ll
-Transforms/LoopVectorize/X86/pr47437.ll
-Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll
-Transforms/LoopVectorize/X86/pr54634.ll
-Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll
-Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
-Transforms/LoopVectorize/X86/pr81872.ll
-Transforms/LoopVectorize/X86/predicate-switch.ll
-Transforms/LoopVectorize/X86/propagate-metadata.ll
-Transforms/LoopVectorize/X86/reduction-fastmath.ll
-Transforms/LoopVectorize/X86/reg-usage.ll
-Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
-Transforms/LoopVectorize/X86/replicate-uniform-call.ll
-Transforms/LoopVectorize/X86/scatter_crash.ll
-Transforms/LoopVectorize/X86/small-size.ll
-Transforms/LoopVectorize/X86/strided_load_cost.ll
-Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
-Transforms/LoopVectorize/X86/tail_loop_folding.ll
-Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
-Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
-Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll
-Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
-Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
-Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
-Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
-Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
-Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
-Transforms/LoopVectorize/X86/x86-pr39099.ll
-Transforms/LoopVectorize/X86/x86-predication.ll
Transforms/LoopVersioning/add-phi-update-users.ll
Transforms/LoopVersioning/basic.ll
Transforms/LoopVersioning/bound-check-partially-known.ll
@@ -1739,6 +1296,7 @@ Transforms/PGOProfile/chr-dead-pred.ll
Transforms/PGOProfile/chr-dup-threshold.ll
Transforms/PGOProfile/chr.ll
Transforms/PGOProfile/chr-poison.ll
+Transforms/PGOProfile/chr-lifetimes.ll
Transforms/PGOProfile/comdat.ll
Transforms/PGOProfile/cspgo_profile_summary.ll
Transforms/PGOProfile/memop_profile_funclet_wasm.ll
diff --git a/mlir/docs/Tutorials/Toy/Ch-6.md b/mlir/docs/Tutorials/Toy/Ch-6.md
index 529de55..178c073 100644
--- a/mlir/docs/Tutorials/Toy/Ch-6.md
+++ b/mlir/docs/Tutorials/Toy/Ch-6.md
@@ -245,7 +245,7 @@ define void @main()
```
The full code listing for dumping LLVM IR can be found in
-`examples/toy/Ch6/toy.cpp` in the `dumpLLVMIR()` function:
+`examples/toy/Ch6/toyc.cpp` in the `dumpLLVMIR()` function:
```c++
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 8759f1d..8b687a7 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -1361,6 +1361,37 @@ def ROCDL_CvtScaleF32PkFp4F32Op :
}
//===----------------------------------------------------------------------===//
+// FMED3 operations
+//===----------------------------------------------------------------------===//
+
+def ROCDL_FMed3Op : ROCDL_IntrOp<"fmed3", [0], [], [Pure, AllTypesMatch<["res", "src0", "src1", "src2"]>], 1>,
+ Arguments<(ins LLVM_ScalarOrVectorOf<LLVM_AnyFloat>:$src0,
+ LLVM_ScalarOrVectorOf<LLVM_AnyFloat>:$src1,
+ LLVM_ScalarOrVectorOf<LLVM_AnyFloat>:$src2)> {
+ let results = (outs LLVM_ScalarOrVectorOf<LLVM_AnyFloat>:$res);
+ let summary = "Median of three float/half values";
+ let description = [{
+ Computes the median of three floating-point values using the AMDGPU fmed3 intrinsic.
+ This operation is equivalent to `max(min(a, b), min(max(a, b), c))` but uses the
+ hardware-accelerated V_MED3_F16/V_MED3_F32 instruction for better performance.
+
+ The operation supports both scalar and vector floating-point types (f16, f32).
+
+ Example:
+ ```mlir
+ // Scalar f32 median
+ %result = rocdl.fmed3 %a, %b, %c : f32
+
+ // Vector f16 median
+ %result = rocdl.fmed3 %va, %vb, %vc : vector<4xf16>
+ ```
+ }];
+ let assemblyFormat = [{
+ $src0 `,` $src1 `,` $src2 attr-dict `:` type($res)
+ }];
+}
+
+//===----------------------------------------------------------------------===//
// ROCDL target attribute.
//===----------------------------------------------------------------------===//
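The identity quoted in the fmed3 description above is easy to sanity-check on the host. A minimal sketch in plain C++ — this models only the finite-input math, not the V_MED3 hardware instruction's NaN handling:

```c++
#include <algorithm>
#include <cassert>

// Reference semantics of fmed3 for finite inputs:
// max(min(a, b), min(max(a, b), c)) selects the middle value.
static float fmed3Ref(float A, float B, float C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

int main() {
  assert(fmed3Ref(1.0f, 3.0f, 2.0f) == 2.0f); // median of {1, 2, 3}
  assert(fmed3Ref(5.0f, 5.0f, 1.0f) == 5.0f); // duplicates are handled
  return 0;
}
```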
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index cd92ca9..2bf953e 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -1241,28 +1241,28 @@ def LoadOp : MemRef_Op<"load",
OpBuilder<(ins "Value":$memref,
"ValueRange":$indices,
CArg<"bool", "false">:$nontemporal,
- CArg<"uint64_t", "0">:$alignment), [{
+ CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{
return build($_builder, $_state, memref, indices, nontemporal,
- alignment != 0 ? $_builder.getI64IntegerAttr(alignment) :
- nullptr);
+ alignment ? $_builder.getI64IntegerAttr(alignment->value()) :
+ nullptr);
}]>,
OpBuilder<(ins "Type":$resultType,
"Value":$memref,
"ValueRange":$indices,
CArg<"bool", "false">:$nontemporal,
- CArg<"uint64_t", "0">:$alignment), [{
+ CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{
return build($_builder, $_state, resultType, memref, indices, nontemporal,
- alignment != 0 ? $_builder.getI64IntegerAttr(alignment) :
- nullptr);
+ alignment ? $_builder.getI64IntegerAttr(alignment->value()) :
+ nullptr);
}]>,
OpBuilder<(ins "TypeRange":$resultTypes,
"Value":$memref,
"ValueRange":$indices,
CArg<"bool", "false">:$nontemporal,
- CArg<"uint64_t", "0">:$alignment), [{
+ CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{
return build($_builder, $_state, resultTypes, memref, indices, nontemporal,
- alignment != 0 ? $_builder.getI64IntegerAttr(alignment) :
- nullptr);
+ alignment ? $_builder.getI64IntegerAttr(alignment->value()) :
+ nullptr);
}]>
];
@@ -2007,10 +2007,10 @@ def MemRef_StoreOp : MemRef_Op<"store",
"Value":$memref,
"ValueRange":$indices,
CArg<"bool", "false">:$nontemporal,
- CArg<"uint64_t", "0">:$alignment), [{
+ CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{
return build($_builder, $_state, valueToStore, memref, indices, nontemporal,
- alignment != 0 ? $_builder.getI64IntegerAttr(alignment) :
- nullptr);
+ alignment ? $_builder.getI64IntegerAttr(alignment->value()) :
+ nullptr);
}]>,
OpBuilder<(ins "Value":$valueToStore, "Value":$memref), [{
$_state.addOperands(valueToStore);
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index 553d69cc..93ab120 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -282,8 +282,7 @@ def Tosa_Shape : Tosa_Type<"shape", "shape"> {
!tosa.shape<0>
```
}];
- let parameters = (ins "int" : $rank);
- let builders = [TypeBuilder<(ins "int" : $rank)>];
+ let parameters = (ins "int":$rank);
let assemblyFormat = "`<` $rank `>`";
let genVerifyDecl = 1;
diff --git a/mlir/include/mlir/TableGen/Class.h b/mlir/include/mlir/TableGen/Class.h
index 1034967..e6bedc7 100644
--- a/mlir/include/mlir/TableGen/Class.h
+++ b/mlir/include/mlir/TableGen/Class.h
@@ -789,6 +789,10 @@ public:
std::forward<Args>(args)...);
}
+ const std::vector<std::unique_ptr<Method>> &getMethods() const {
+ return methods;
+ }
+
/// Add a new field to the class. Class fields added this way are always
/// private.
template <typename TypeT, typename NameT>
diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
index fdb97d5..d705d8d4 100644
--- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
@@ -109,19 +109,19 @@ LivenessAnalysis::visitOperation(Operation *op, ArrayRef<Liveness *> operands,
foundLiveResult = true;
}
LDBG() << "[visitOperation] Adding dependency for result: " << r
- << " after op: " << *op;
+ << " after op: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
addDependency(const_cast<Liveness *>(r), getProgramPointAfter(op));
}
return success();
}
void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
+ Operation *op = operand.getOwner();
LDBG() << "Visiting branch operand: " << operand.get()
- << " in op: " << *operand.getOwner();
+ << " in op: " << OpWithFlags(op, OpPrintingFlags().skipRegions());
// We know (at the moment) and assume (for the future) that `operand` is a
// non-forwarded branch operand of a `RegionBranchOpInterface`,
// `BranchOpInterface`, `RegionBranchTerminatorOpInterface` or return-like op.
- Operation *op = operand.getOwner();
assert((isa<RegionBranchOpInterface>(op) || isa<BranchOpInterface>(op) ||
isa<RegionBranchTerminatorOpInterface>(op)) &&
"expected the op to be `RegionBranchOpInterface`, "
@@ -146,12 +146,13 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
// Therefore, if the result value is live, we conservatively consider the
// non-forwarded operand of the region branch operation with result may
// live and record all result.
- for (Value result : op->getResults()) {
+ for (auto [resultIndex, result] : llvm::enumerate(op->getResults())) {
if (getLatticeElement(result)->isLive) {
mayLive = true;
- LDBG() << "[visitBranchOperand] Non-forwarded branch "
- "operand may be live due to live result: "
- << result;
+ LDBG() << "[visitBranchOperand] Non-forwarded branch operand may be "
+ "live due to live result #"
+ << resultIndex << ": "
+ << OpWithFlags(op, OpPrintingFlags().skipRegions());
break;
}
}
@@ -233,7 +234,8 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) {
SmallVector<const Liveness *, 4> resultsLiveness;
for (const Value result : op->getResults())
resultsLiveness.push_back(getLatticeElement(result));
- LDBG() << "Visiting operation for non-forwarded branch operand: " << *op;
+ LDBG() << "Visiting operation for non-forwarded branch operand: "
+ << OpWithFlags(op, OpPrintingFlags().skipRegions());
(void)visitOperation(op, operandLiveness, resultsLiveness);
// We also visit the parent op with the parent's results and this operand if
diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp
index 20392b9..f18298e 100644
--- a/mlir/lib/Bindings/Python/Rewrite.cpp
+++ b/mlir/lib/Bindings/Python/Rewrite.cpp
@@ -155,7 +155,7 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) {
mlirPDLResultListPushBackValue(results, value);
},
// clang-format off
- nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Value") ")")
+ nb::sig("def append(self, value: " MAKE_MLIR_PYTHON_QUALNAME("ir.Value") ")")
// clang-format on
)
.def(
@@ -164,7 +164,7 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) {
mlirPDLResultListPushBackOperation(results, op);
},
// clang-format off
- nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Operation") ")")
+ nb::sig("def append(self, op: " MAKE_MLIR_PYTHON_QUALNAME("ir.Operation") ")")
// clang-format on
)
.def(
@@ -173,7 +173,7 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) {
mlirPDLResultListPushBackType(results, type);
},
// clang-format off
- nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Type") ")")
+ nb::sig("def append(self, type: " MAKE_MLIR_PYTHON_QUALNAME("ir.Type") ")")
// clang-format on
)
.def(
@@ -182,7 +182,7 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) {
mlirPDLResultListPushBackAttribute(results, attr);
},
// clang-format off
- nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Attribute") ")")
+ nb::sig("def append(self, attr: " MAKE_MLIR_PYTHON_QUALNAME("ir.Attribute") ")")
// clang-format on
);
nb::class_<PyPDLPatternModule>(m, "PDLModule")
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 1037e29..a73afbc 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -663,7 +663,7 @@ static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) {
/// Generates a symbol with 0-sized array type for dynamic shared memory usage,
/// or uses existing symbol.
-LLVM::GlobalOp getDynamicSharedMemorySymbol(
+static LLVM::GlobalOp getDynamicSharedMemorySymbol(
ConversionPatternRewriter &rewriter, gpu::GPUModuleOp moduleOp,
gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter,
MemRefType memrefType, unsigned alignmentBit) {
diff --git a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
index d95aeba..da4443d 100644
--- a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
@@ -67,7 +67,7 @@ LogicalResult mlir::LLVM::createPrintStrCall(
auto arrayTy =
LLVM::LLVMArrayType::get(IntegerType::get(ctx, 8), elementVals.size());
auto globalOp = LLVM::GlobalOp::create(
- builder, loc, arrayTy, /*constant=*/true, LLVM::Linkage::Private,
+ builder, loc, arrayTy, /*isConstant=*/true, LLVM::Linkage::Private,
ensureSymbolNameIsUnique(moduleOp, symbolName, symbolTables), dataAttr);
auto ptrTy = LLVM::LLVMPointerType::get(builder.getContext());
diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
index 262e0e7..cc6314c 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
@@ -48,8 +48,8 @@ static bool isStaticStrideOrOffset(int64_t strideOrOffset) {
}
static FailureOr<LLVM::LLVMFuncOp>
-getFreeFn(OpBuilder &b, const LLVMTypeConverter *typeConverter, ModuleOp module,
- SymbolTableCollection *symbolTables) {
+getFreeFn(OpBuilder &b, const LLVMTypeConverter *typeConverter,
+ Operation *module, SymbolTableCollection *symbolTables) {
bool useGenericFn = typeConverter->getOptions().useGenericFunctions;
if (useGenericFn)
@@ -483,8 +483,8 @@ public:
ConversionPatternRewriter &rewriter) const override {
// Insert the `free` declaration if it is not already present.
FailureOr<LLVM::LLVMFuncOp> freeFunc =
- getFreeFn(rewriter, getTypeConverter(), op->getParentOfType<ModuleOp>(),
- symbolTables);
+ getFreeFn(rewriter, getTypeConverter(),
+ op->getParentWithTrait<OpTrait::SymbolTable>(), symbolTables);
if (failed(freeFunc))
return failure();
Value allocatedPtr;
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index 79cb49a..d6a2622 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -741,7 +741,7 @@ creatLdMatrixCompatibleLoads(RewriterBase &rewriter, vector::TransferReadOp op,
}
// Adjust the load offset.
- auto laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr);
+ auto laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
FailureOr<AffineMap> offsets =
nvgpu::getLaneIdToLdMatrixMatrixCoord(rewriter, loc, *params);
if (failed(offsets)) {
@@ -781,7 +781,7 @@ createNonLdMatrixLoads(RewriterBase &rewriter, vector::TransferReadOp op,
"conversion to distributed non-ldmatrix compatible load");
}
- Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr);
+ Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
// This is the individual element type.
Type loadedElType = regInfo->registerLLVMType;
@@ -915,7 +915,7 @@ convertTransferWriteToStores(RewriterBase &rewriter, vector::TransferWriteOp op,
return rewriter.notifyMatchFailure(op, "not mma sync reg info");
VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
- Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr);
+ Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
for (unsigned i = 0; i < vectorType.getShape()[0]; i++) {
Value logicalValueId = arith::ConstantOp::create(
diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
index 582593a..f1da1a1 100644
--- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
+++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
@@ -122,6 +122,16 @@ static LogicalResult collapseBranch(Block *&successor,
Block *successorDest = successorBranch.getDest();
if (successorDest == successor)
return failure();
+ // Don't try to collapse branches which participate in a cycle.
+ BranchOp nextBranch = dyn_cast<BranchOp>(successorDest->getTerminator());
+ llvm::DenseSet<Block *> visited{successor, successorDest};
+ while (nextBranch) {
+ Block *nextBranchDest = nextBranch.getDest();
+ if (visited.contains(nextBranchDest))
+ return failure();
+ visited.insert(nextBranchDest);
+ nextBranch = dyn_cast<BranchOp>(nextBranchDest->getTerminator());
+ }
// Update the operands to the successor. If the branch parent has no
// arguments, we can use the branch operands directly.
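The guard added above walks the chain of unconditional branches with a visited set. A standalone sketch of the same walk over a hypothetical block type, with `Dest` standing in for `BranchOp::getDest()`:

```c++
#include <unordered_set>

// Hypothetical block: Dest is the unconditional-branch successor, or nullptr
// when the terminator is not an unconditional branch.
struct Block { Block *Dest = nullptr; };

// Returns true if following unconditional branches from SuccessorDest ever
// revisits a block, i.e. collapsing the branch would chase a cycle forever.
static bool branchChainHasCycle(Block *Successor, Block *SuccessorDest) {
  std::unordered_set<Block *> Visited{Successor, SuccessorDest};
  for (Block *Next = SuccessorDest->Dest; Next; Next = Next->Dest) {
    if (!Visited.insert(Next).second)
      return true; // already seen: the chain is a cycle
  }
  return false;
}
```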
diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b45fdf3..81c3069 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -430,7 +430,7 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
dpp = ROCDL::PermlaneX16Op::create(rewriter, loc, res.getType(), res, res,
uint32Max, uint32Max,
/*fi=*/true,
- /*bound_ctrl=*/false);
+ /*boundControl=*/false);
res = vector::makeArithReduction(
rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
} else {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
index 8942670..0956c5d 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp
@@ -141,7 +141,7 @@ SmallVector<OpFoldResult> linalg::computePaddedShape(
projectedDims.flip(paddingDim);
AffineMap projectedMap =
mlir::projectDims(partialIndexingMap, projectedDims,
- /*compressDims=*/true);
+ /*compressDimsFlag=*/true);
// If we are padding to the next multiple of, compose with ceil(sz) * sz.
OpFoldResult paddingDimOfr;
diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
index 3de9c38..6200366 100644
--- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
+++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp
@@ -191,7 +191,7 @@ computeSuffixProductIRBlock(Location loc, OpBuilder &builder,
}
MemrefValue skipFullyAliasingOperations(MemrefValue source) {
- while (auto op = source.getDefiningOp()) {
+ while (auto *op = source.getDefiningOp()) {
if (auto subViewOp = dyn_cast<memref::SubViewOp>(op);
subViewOp && subViewOp.hasZeroOffset() && subViewOp.hasUnitStride()) {
// A `memref.subview` with an all zero offset, and all unit strides, still
@@ -208,7 +208,7 @@ MemrefValue skipFullyAliasingOperations(MemrefValue source) {
}
MemrefValue skipViewLikeOps(MemrefValue source) {
- while (auto op = source.getDefiningOp()) {
+ while (auto *op = source.getDefiningOp()) {
if (auto viewLike = dyn_cast<ViewLikeOpInterface>(op)) {
if (source == viewLike.getViewDest()) {
source = cast<MemrefValue>(viewLike.getViewSource());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
index 659282a..f539502 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
@@ -344,7 +344,7 @@ void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) {
// Reverse queue into a stack.
std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end());
for (auto [loop, coeff] : dependentLvlMap[t][lvl])
- depRedOrder.emplace_back(std::make_tuple(loop, t, lvl));
+ depRedOrder.emplace_back(loop, t, lvl);
}
if (depRedOrder.empty())
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp
index 78f74ee..bdbb792 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp
@@ -64,7 +64,6 @@ struct VectorMaskedLoadOpConverter final
Value mask = maskedLoadOp.getMask();
Value base = maskedLoadOp.getBase();
Value iValue = maskedLoadOp.getPassThru();
- std::optional<uint64_t> alignment = maskedLoadOp.getAlignment();
auto indices = llvm::to_vector_of<Value>(maskedLoadOp.getIndices());
Value one = arith::ConstantOp::create(rewriter, loc, indexType,
IntegerAttr::get(indexType, 1));
@@ -76,7 +75,7 @@ struct VectorMaskedLoadOpConverter final
[&](OpBuilder &builder, Location loc) {
auto loadedValue = memref::LoadOp::create(
builder, loc, base, indices, /*nontemporal=*/false,
- alignment.value_or(0));
+ llvm::MaybeAlign(maskedLoadOp.getAlignment().value_or(0)));
auto combinedValue =
vector::InsertOp::create(builder, loc, loadedValue, iValue, i);
scf::YieldOp::create(builder, loc, combinedValue.getResult());
@@ -135,7 +134,6 @@ struct VectorMaskedStoreOpConverter final
Value base = maskedStoreOp.getBase();
Value value = maskedStoreOp.getValueToStore();
bool nontemporal = false;
- std::optional<uint64_t> alignment = maskedStoreOp.getAlignment();
auto indices = llvm::to_vector_of<Value>(maskedStoreOp.getIndices());
Value one = arith::ConstantOp::create(rewriter, loc, indexType,
IntegerAttr::get(indexType, 1));
@@ -145,8 +143,9 @@ struct VectorMaskedStoreOpConverter final
auto ifOp = scf::IfOp::create(rewriter, loc, maskBit, /*else=*/false);
rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
auto extractedValue = vector::ExtractOp::create(rewriter, loc, value, i);
- memref::StoreOp::create(rewriter, loc, extractedValue, base, indices,
- nontemporal, alignment.value_or(0));
+ memref::StoreOp::create(
+ rewriter, loc, extractedValue, base, indices, nontemporal,
+ llvm::MaybeAlign(maskedStoreOp.getAlignment().value_or(0)));
rewriter.setInsertionPointAfter(ifOp);
indices.back() =
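The masked load/store hunks above stop passing a raw `uint64_t` alignment and wrap it in `llvm::MaybeAlign`, whose zero value means "no alignment specified". A standalone model of that convention, assuming it mirrors `llvm::MaybeAlign` (built on `std::optional` so it compiles without LLVM headers):

```cpp
// Model (assumption: mirrors llvm::MaybeAlign) of the "0 means unspecified"
// convention used when forwarding getAlignment().value_or(0) above.
#include <cassert>
#include <cstdint>
#include <optional>

struct MaybeAlignModel {
  std::optional<uint64_t> Value;
  explicit MaybeAlignModel(uint64_t V) {
    assert((V == 0 || (V & (V - 1)) == 0) && "must be 0 or a power of 2");
    if (V != 0)
      Value = V; // 0 stays "unspecified"
  }
};

int main() {
  std::optional<uint64_t> attr;       // op carries no alignment attribute
  MaybeAlignModel a(attr.value_or(0));
  assert(!a.Value.has_value());       // 0 collapses to "unspecified"

  MaybeAlignModel b(uint64_t{16});    // explicit 16-byte alignment
  assert(b.Value && *b.Value == 16);
}
```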
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 20608f9..81b5788 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -23,7 +23,7 @@
namespace mlir {
namespace xegpu {
-bool isSharedMemory(const MemRefType &memrefTy) {
+static bool isSharedMemory(const MemRefType &memrefTy) {
Attribute attr = memrefTy.getMemorySpace();
if (auto intAttr = llvm::dyn_cast<IntegerAttr>(attr))
return intAttr.getInt() == 3;
@@ -340,7 +340,7 @@ LogicalResult CreateNdDescOp::verify() {
return success();
}
-ParseResult parseOptionalDynamicIndexList(
+static ParseResult parseOptionalDynamicIndexList(
OpAsmParser &parser,
SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values,
DenseI64ArrayAttr &integers, SmallVectorImpl<Type> *valueTypes = nullptr,
@@ -378,9 +378,9 @@ ParseResult parseOptionalDynamicIndexList(
return success();
}
-void printOptionalDynamicIndexList(OpAsmPrinter &printer, Operation *op,
- OperandRange values,
- DenseI64ArrayAttr integers) {
+static void printOptionalDynamicIndexList(OpAsmPrinter &printer, Operation *op,
+ OperandRange values,
+ DenseI64ArrayAttr integers) {
if (!integers || integers.empty())
return;
printDynamicIndexList(printer, op, values, integers,
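Marking these file-local helpers `static` (here and in the JitRunner hunk below) gives them internal linkage: no symbol leaks out of the translation unit, and warnings about definitions that have no prior declaration go away. A minimal illustration, with a hypothetical helper rather than the XeGPU code:

```cpp
// file-local helper: internal linkage, so the symbol is not exported and
// no "missing prototype" diagnostic fires for a header-less definition.
static bool isSharedMemorySpace(int addressSpace) {
  return addressSpace == 3; // assumption: 3 denotes shared/workgroup memory
}

int main() { return isSharedMemorySpace(3) ? 0 : 1; }
```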
diff --git a/mlir/lib/ExecutionEngine/JitRunner.cpp b/mlir/lib/ExecutionEngine/JitRunner.cpp
index 0ada4cc..db05165 100644
--- a/mlir/lib/ExecutionEngine/JitRunner.cpp
+++ b/mlir/lib/ExecutionEngine/JitRunner.cpp
@@ -271,7 +271,7 @@ Error checkCompatibleReturnType<float>(LLVM::LLVMFuncOp mainFunction) {
return Error::success();
}
template <typename Type>
-Error compileAndExecuteSingleReturnFunction(
+static Error compileAndExecuteSingleReturnFunction(
Options &options, Operation *module, StringRef entryPoint,
CompileAndExecuteConfig config, std::unique_ptr<llvm::TargetMachine> tm) {
auto mainFunction = dyn_cast_or_null<LLVM::LLVMFuncOp>(
diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index d36a3c1..b305712 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -286,7 +286,7 @@ LogicalResult MemorySlotPromotionAnalyzer::computeBlockingUses(
mlir::getForwardSlice(slot.ptr, &forwardSlice);
for (Operation *user : forwardSlice) {
// If the next operation has no blocking uses, everything is fine.
- auto it = userToBlockingUses.find(user);
+ auto *it = userToBlockingUses.find(user);
if (it == userToBlockingUses.end())
continue;
diff --git a/mlir/test/Dialect/ControlFlow/canonicalize.mlir b/mlir/test/Dialect/ControlFlow/canonicalize.mlir
index bf69935..17f7d28 100644
--- a/mlir/test/Dialect/ControlFlow/canonicalize.mlir
+++ b/mlir/test/Dialect/ControlFlow/canonicalize.mlir
@@ -490,3 +490,147 @@ func.func @branchCondProp(%arg0: i1) {
^exit:
return
}
+
+// -----
+
+/// Test that control-flow cycles are not simplified infinitely.
+
+// CHECK-LABEL: @cycle_2_blocks
+// CHECK: cf.br ^bb1
+// CHECK: ^bb1:
+// CHECK: cf.br ^bb1
+func.func @cycle_2_blocks() {
+ cf.br ^bb1
+^bb1:
+ cf.br ^bb2
+^bb2:
+ cf.br ^bb1
+}
+
+// CHECK-LABEL: @no_cycle_2_blocks
+// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32
+// CHECK: return %[[VAL_0]] : i32
+func.func @no_cycle_2_blocks() -> i32 {
+ cf.br ^bb1
+^bb1:
+ cf.br ^bb2
+^bb2:
+ cf.br ^bb3
+^bb3:
+ %ret = arith.constant 1 : i32
+ return %ret : i32
+}
+
+// CHECK-LABEL: @cycle_4_blocks
+// CHECK: cf.br ^bb1
+// CHECK: ^bb1:
+// CHECK: cf.br ^bb1
+func.func @cycle_4_blocks() {
+ cf.br ^bb1
+^bb1:
+ cf.br ^bb2
+^bb2:
+ cf.br ^bb3
+^bb3:
+ cf.br ^bb4
+^bb4:
+ cf.br ^bb1
+}
+
+// CHECK-LABEL: @no_cycle_4_blocks
+// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32
+// CHECK: return %[[VAL_0]] : i32
+func.func @no_cycle_4_blocks() -> i32 {
+ cf.br ^bb1
+^bb1:
+ cf.br ^bb2
+^bb2:
+ cf.br ^bb3
+^bb3:
+ cf.br ^bb4
+^bb4:
+ cf.br ^bb5
+^bb5:
+ %ret = arith.constant 1 : i32
+ return %ret : i32
+}
+
+// CHECK-LABEL: @delayed_3_cycle
+// CHECK: cf.br ^bb1
+// CHECK: ^bb1:
+// CHECK: cf.br ^bb1
+func.func @delayed_3_cycle() {
+ cf.br ^bb1
+^bb1:
+ cf.br ^bb2
+^bb2:
+ cf.br ^bb3
+^bb3:
+ cf.br ^bb4
+^bb4:
+ cf.br ^bb5
+^bb5:
+ cf.br ^bb3
+}
+
+// CHECK-LABEL: @cycle_1_block
+// CHECK: cf.br ^bb1
+// CHECK: ^bb1:
+// CHECK: cf.br ^bb1
+func.func @cycle_1_block() {
+ cf.br ^bb1
+^bb1:
+ cf.br ^bb2
+^bb2:
+ cf.br ^bb2
+}
+
+// CHECK-LABEL: @unsimplified_cycle_1
+// CHECK-SAME: %[[ARG0:.*]]: i1) {
+// CHECK: cf.cond_br %[[ARG0]], ^bb1, ^bb2
+// CHECK: ^bb1:
+// CHECK: cf.br ^bb2
+// CHECK: ^bb2:
+// CHECK: cf.br ^bb3
+// CHECK: ^bb3:
+// CHECK: cf.br ^bb3
+func.func @unsimplified_cycle_1(%c : i1) {
+ cf.cond_br %c, ^bb1, ^bb2
+^bb1:
+ cf.br ^bb2
+^bb2:
+ cf.br ^bb3
+^bb3:
+ cf.br ^bb4
+^bb4:
+ cf.br ^bb3
+}
+
+// Make sure we terminate when other cf passes can't help us.
+
+// CHECK-LABEL: @unsimplified_cycle_2
+// CHECK-SAME: %[[ARG0:.*]]: i1) {
+// CHECK: cf.cond_br %[[ARG0]], ^bb1, ^bb3
+// CHECK: ^bb1:
+// CHECK: cf.br ^bb2 {A}
+// CHECK: ^bb2:
+// CHECK: cf.br ^bb2 {E}
+// CHECK: ^bb3:
+// CHECK: cf.br ^bb1
+func.func @unsimplified_cycle_2(%c : i1) {
+ cf.cond_br %c, ^bb6, ^bb7
+^bb6:
+ cf.br ^bb5 {F}
+^bb5:
+ cf.br ^bb1 {A}
+^bb1:
+ cf.br ^bb2 {B}
+^bb2:
+ cf.br ^bb3 {C}
+^bb3:
+ cf.br ^bb4 {D}
+^bb4:
+ cf.br ^bb1 {E}
+^bb7:
+ cf.br ^bb6
+}
diff --git a/mlir/test/Dialect/GPU/memref-to-llvm.mlir b/mlir/test/Dialect/GPU/memref-to-llvm.mlir
new file mode 100644
index 0000000..81a96bf
--- /dev/null
+++ b/mlir/test/Dialect/GPU/memref-to-llvm.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-opt --convert-to-llvm %s | FileCheck %s
+
+// Checking that malloc and free are declared in the proper module.
+
+// CHECK: module attributes {gpu.container_module} {
+// CHECK: llvm.func @free(!llvm.ptr)
+// CHECK: llvm.func @malloc(i64) -> !llvm.ptr
+// CHECK: gpu.module @kernels {
+// CHECK: llvm.func @free(!llvm.ptr)
+// CHECK: llvm.func @malloc(i64) -> !llvm.ptr
+// CHECK: gpu.func @kernel_1
+// CHECK: llvm.call @malloc({{.*}}) : (i64) -> !llvm.ptr
+// CHECK: llvm.call @free({{.*}}) : (!llvm.ptr) -> ()
+// CHECK: gpu.return
+// CHECK: }
+// CHECK: }
+// CHECK: }
+module attributes {gpu.container_module} {
+
+ gpu.module @kernels {
+ gpu.func @kernel_1() kernel {
+ %memref_a = memref.alloc() : memref<8x16xf32>
+ memref.dealloc %memref_a : memref<8x16xf32>
+ gpu.return
+ }
+ }
+
+ func.func @main() {
+ %memref_a = memref.alloc() : memref<8x16xf32>
+ memref.dealloc %memref_a : memref<8x16xf32>
+ return
+ }
+}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index e127fdb..0bad151 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -29,6 +29,20 @@ func.func @rocdl_special_regs() -> i32 {
llvm.return %0 : i32
}
+func.func @rocdl.fmed3.scalar(%a: f32, %b: f32, %c: f32) -> f32 {
+ // CHECK-LABEL: rocdl.fmed3.scalar
+ // CHECK: %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f32
+ %0 = rocdl.fmed3 %a, %b, %c : f32
+ llvm.return %0 : f32
+}
+
+func.func @rocdl.fmed3.vector(%a: vector<4xf16>, %b: vector<4xf16>, %c: vector<4xf16>) -> vector<4xf16> {
+ // CHECK-LABEL: rocdl.fmed3.vector
+ // CHECK: %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : vector<4xf16>
+ %0 = rocdl.fmed3 %a, %b, %c : vector<4xf16>
+ llvm.return %0 : vector<4xf16>
+}
+
func.func @rocdl.barrier() {
// CHECK: rocdl.barrier
rocdl.barrier
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index c629877..e043a8c 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1298,6 +1298,20 @@ llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 {
llvm.return %ret : i32
}
+llvm.func @test_fmed3_f16(%arg0: f16, %arg1: f16, %arg2: f16) -> f16 {
+ // CHECK-LABEL: define half @test_fmed3_f16(half %0, half %1, half %2)
+ %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f16
+ llvm.return %0 : f16
+ // CHECK: call half @llvm.amdgcn.fmed3.f16(half %0, half %1, half %2)
+}
+
+llvm.func @test_fmed3_f32(%arg0: f32, %arg1: f32, %arg2: f32) -> f32 {
+ // CHECK-LABEL: define float @test_fmed3_f32(float %0, float %1, float %2)
+ %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f32
+ llvm.return %0 : f32
+ // CHECK: call float @llvm.amdgcn.fmed3.f32(float %0, float %1, float %2)
+}
+
// CHECK-LABEL: rocdl.cvt.scale.pk8
// CHECK-SAME:(i32 %[[I32:.+]], <2 x i32> %[[V2I32:.+]], i32 %[[SCALE:.+]])
llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) {
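`rocdl.fmed3` lowers to the `llvm.amdgcn.fmed3.*` intrinsics, which return the median of their three operands. A reference model of what the tests above exercise, assuming ordinary median-of-three semantics and ignoring the intrinsic's NaN handling:

```cpp
// Reference model (not the intrinsic itself): branch-free median of three
// via max(min(a,b), min(max(a,b), c)).
#include <algorithm>
#include <cassert>

static float fmed3(float a, float b, float c) {
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}

int main() {
  assert(fmed3(1.0f, 2.0f, 3.0f) == 2.0f);
  assert(fmed3(3.0f, 1.0f, 2.0f) == 2.0f);
  assert(fmed3(2.0f, 3.0f, 1.0f) == 2.0f);
}
```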
diff --git a/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td b/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td
new file mode 100644
index 0000000..5f1c61a
--- /dev/null
+++ b/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td
@@ -0,0 +1,48 @@
+// RUN: not mlir-tblgen -gen-attrdef-decls -I %S/../../include %s 2>&1 | FileCheck %s
+
+include "mlir/IR/OpBase.td"
+
+def Test_Dialect : Dialect {
+ let name = "test";
+ let cppNamespace = "::test";
+}
+
+class TestAttr<string attrName, string attrMnemonic, list<Trait> traits = []>
+ : AttrDef<Test_Dialect, attrName, traits> {
+ let mnemonic = attrMnemonic;
+}
+
+def TestAttr : TestAttr<"Test", "test"> {
+ let summary = "Test attribute";
+ let description = "Test attribute";
+
+ let parameters = (ins AttrParameter<"std::int64_t", "arg">:$arg);
+ let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
+ return $_get($_ctxt, arg);
+ }]>];
+
+ let assemblyFormat = "`<` $arg `>`";
+
+ let skipDefaultBuilders = 0;
+ let genVerifyDecl = 1;
+ let genMnemonicAlias = 1;
+}
+
+def Test_TestAttrOp : Op<Test_Dialect, "test", []> {
+ let summary = "test operation with attribute";
+ let description = "test operation with attribute";
+
+ let arguments = (ins TestAttr:$testAttr);
+ let assemblyFormat = "$testAttr attr-dict";
+}
+
+// CHECK: attr-duplicated-builder-error.td:20:7: error: builder `get` conflicts with an existing builder.
+// CHECK-NEXT: let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
+// CHECK-NEXT: ^
+// CHECK-NEXT: note: A new builder with signature:
+// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg);
+// CHECK-EMPTY:
+// CHECK-NEXT: is shadowed by an existing builder with signature:
+// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg);
+// CHECK-EMPTY:
+// CHECK-NEXT: Please remove one of the conflicting definitions.
diff --git a/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td b/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td
new file mode 100644
index 0000000..0e09f66
--- /dev/null
+++ b/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td
@@ -0,0 +1,52 @@
+// RUN: not mlir-tblgen -gen-attrdef-decls -I %S/../../include %s 2>&1 | FileCheck %s
+
+include "mlir/IR/OpBase.td"
+
+def Test_Dialect : Dialect {
+ let name = "test";
+ let cppNamespace = "::test";
+}
+
+class TestAttr<string attrName, string attrMnemonic, list<Trait> traits = []>
+ : AttrDef<Test_Dialect, attrName, traits> {
+ let mnemonic = attrMnemonic;
+}
+
+def TestAttr : TestAttr<"Test", "test"> {
+ let summary = "Test attribute";
+ let description = "Test attribute";
+
+ let parameters = (ins AttrParameter<"std::int64_t", "arg">:$arg);
+ let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
+ return $_get($_ctxt, arg);
+ }]>,
+ AttrBuilder<(ins "std::int64_t":$arg), [{
+ // Duplicated builder
+ return $_get($_ctxt, arg);
+ }]>];
+
+ let assemblyFormat = "`<` $arg `>`";
+
+ let skipDefaultBuilders = 1;
+ let genVerifyDecl = 1;
+ let genMnemonicAlias = 1;
+}
+
+def Test_TestAttrOp : Op<Test_Dialect, "test", []> {
+ let summary = "test operation with attribute";
+ let description = "test operation with attribute";
+
+ let arguments = (ins TestAttr:$testAttr);
+ let assemblyFormat = "$testAttr attr-dict";
+}
+
+// CHECK: attr-duplicated-custom-builders-error.td:20:7: error: builder `get` conflicts with an existing builder.
+// CHECK-NEXT: let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
+// CHECK-NEXT: ^
+// CHECK-NEXT: note: A new builder with signature:
+// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg);
+// CHECK-EMPTY:
+// CHECK-NEXT: is shadowed by an existing builder with signature:
+// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg);
+// CHECK-EMPTY:
+// CHECK-NEXT: Please remove one of the conflicting definitions.
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
index 3140f12..b911565 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
@@ -513,14 +513,57 @@ getCustomBuilderParams(std::initializer_list<MethodParameter> prefix,
return builderParams;
}
+static std::string getSignature(const Method &m) {
+ std::string signature;
+ llvm::raw_string_ostream os(signature);
+ raw_indented_ostream indentedOs(os);
+ m.writeDeclTo(indentedOs);
+ return signature;
+}
+
+static void emitDuplicatedBuilderError(const Method &currentMethod,
+ StringRef methodName,
+ const Class &defCls,
+ const AttrOrTypeDef &def) {
+
+ // Search for an existing method that makes the new builder redundant.
+ auto loc = def.getDef()->getFieldLoc("builders");
+ for (auto &method : defCls.getMethods()) {
+ if (method->getName() == methodName &&
+ method->makesRedundant(currentMethod)) {
+ PrintError(loc, llvm::Twine("builder `") + methodName +
+ "` conflicts with an existing builder. ");
+ PrintFatalNote(llvm::Twine("A new builder with signature:\n") +
+ getSignature(currentMethod) +
+ "\nis shadowed by an existing builder with signature:\n" +
+ getSignature(*method) +
+ "\nPlease remove one of the conflicting "
+ "definitions.");
+ }
+ }
+
+ // This point shouldn't be reached, but keep a fatal error here as a
+ // safeguard in case no conflicting builder is found above.
+ PrintFatalError(loc, "Failed to generate builder " + methodName);
+}
+
void DefGen::emitCustomBuilder(const AttrOrTypeBuilder &builder) {
// Don't emit a body if there isn't one.
auto props = builder.getBody() ? Method::Static : Method::StaticDeclaration;
StringRef returnType = def.getCppClassName();
if (std::optional<StringRef> builderReturnType = builder.getReturnType())
returnType = *builderReturnType;
- Method *m = defCls.addMethod(returnType, "get", props,
- getCustomBuilderParams({}, builder));
+
+ llvm::StringRef methodName = "get";
+ const auto parameters = getCustomBuilderParams({}, builder);
+ Method *m = defCls.addMethod(returnType, methodName, props, parameters);
+
+ // If method is pruned, report error and terminate.
+ if (!m) {
+ auto curMethod = Method(returnType, methodName, props, parameters);
+ emitDuplicatedBuilderError(curMethod, methodName, defCls, def);
+ }
+
if (!builder.getBody())
return;
@@ -547,11 +590,19 @@ void DefGen::emitCheckedCustomBuilder(const AttrOrTypeBuilder &builder) {
StringRef returnType = def.getCppClassName();
if (std::optional<StringRef> builderReturnType = builder.getReturnType())
returnType = *builderReturnType;
- Method *m = defCls.addMethod(
- returnType, "getChecked", props,
- getCustomBuilderParams(
- {{"::llvm::function_ref<::mlir::InFlightDiagnostic()>", "emitError"}},
- builder));
+
+ llvm::StringRef methodName = "getChecked";
+ auto parameters = getCustomBuilderParams(
+ {{"::llvm::function_ref<::mlir::InFlightDiagnostic()>", "emitError"}},
+ builder);
+ Method *m = defCls.addMethod(returnType, methodName, props, parameters);
+
+ // If method is pruned, report error and terminate.
+ if (!m) {
+ auto curMethod = Method(returnType, methodName, props, parameters);
+ emitDuplicatedBuilderError(curMethod, methodName, defCls, def);
+ }
+
if (!builder.getBody())
return;
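The new error path fires when `defCls.addMethod` prunes a builder because an existing method with the same name and parameter list makes it redundant. A minimal sketch of that signature-based conflict check, with hypothetical types rather than the TableGen `Method` API:

```cpp
// Sketch: two generated builders clash when their methods share a name
// and parameter types, which is when addMethod would prune the new one.
#include <cassert>
#include <string>
#include <vector>

struct MethodSig {
  std::string name;
  std::vector<std::string> paramTypes;
  bool makesRedundant(const MethodSig &other) const {
    return name == other.name && paramTypes == other.paramTypes;
  }
};

int main() {
  std::vector<MethodSig> existing = {
      {"get", {"::mlir::MLIRContext *", "std::int64_t"}}};
  MethodSig incoming{"get", {"::mlir::MLIRContext *", "std::int64_t"}};
  bool conflict = false;
  for (const auto &m : existing)
    conflict |= m.makesRedundant(incoming); // would trigger the new error
  assert(conflict);
}
```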
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index 4fdde76..7e8e559 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -3104,8 +3104,8 @@ void OpEmitter::genBuilder() {
std::optional<StringRef> body = builder.getBody();
auto properties = body ? Method::Static : Method::StaticDeclaration;
auto *method = opClass.addMethod("void", "build", properties, arguments);
- if (body)
- ERROR_IF_PRUNED(method, "build", op);
+
+ ERROR_IF_PRUNED(method, "build", op);
if (method)
method->setDeprecated(builder.getDeprecatedMessage());
diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
deleted file mode 100644
index e4916f4..0000000
--- a/offload/DeviceRTL/CMakeLists.txt
+++ /dev/null
@@ -1,188 +0,0 @@
-set(LIBOMPTARGET_BUILD_DEVICERTL_BCLIB TRUE CACHE BOOL
- "Can be set to false to disable building this library.")
-
-if (NOT LIBOMPTARGET_BUILD_DEVICERTL_BCLIB)
- message(STATUS "Not building DeviceRTL: Disabled by LIBOMPTARGET_BUILD_DEVICERTL_BCLIB")
- return()
-endif()
-
-# Check to ensure the host system is a supported host architecture.
-if(NOT ${CMAKE_SIZEOF_VOID_P} EQUAL "8")
- message(STATUS "Not building DeviceRTL: Runtime does not support 32-bit hosts")
- return()
-endif()
-
-if (LLVM_DIR)
- # Builds that use pre-installed LLVM have LLVM_DIR set.
- # A standalone or LLVM_ENABLE_RUNTIMES=openmp build takes this route
- find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD)
- # LLVM in-tree builds may use CMake target names to discover the tools.
- # A LLVM_ENABLE_PROJECTS=openmp build takes this route
- set(CLANG_TOOL $<TARGET_FILE:clang>)
-else()
- message(STATUS "Not building DeviceRTL. No appropriate clang found")
- return()
-endif()
-
-set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR})
-set(include_directory ${devicertl_base_directory}/include)
-set(source_directory ${devicertl_base_directory}/src)
-
-set(include_files
- ${include_directory}/Allocator.h
- ${include_directory}/Configuration.h
- ${include_directory}/Debug.h
- ${include_directory}/Interface.h
- ${include_directory}/LibC.h
- ${include_directory}/Mapping.h
- ${include_directory}/Profiling.h
- ${include_directory}/State.h
- ${include_directory}/Synchronization.h
- ${include_directory}/DeviceTypes.h
- ${include_directory}/DeviceUtils.h
- ${include_directory}/Workshare.h
-)
-
-set(src_files
- ${source_directory}/Allocator.cpp
- ${source_directory}/Configuration.cpp
- ${source_directory}/Debug.cpp
- ${source_directory}/Kernel.cpp
- ${source_directory}/LibC.cpp
- ${source_directory}/Mapping.cpp
- ${source_directory}/Misc.cpp
- ${source_directory}/Parallelism.cpp
- ${source_directory}/Profiling.cpp
- ${source_directory}/Reduction.cpp
- ${source_directory}/State.cpp
- ${source_directory}/Synchronization.cpp
- ${source_directory}/Tasking.cpp
- ${source_directory}/DeviceUtils.cpp
- ${source_directory}/Workshare.cpp
-)
-
-# We disable the slp vectorizer during the runtime optimization to avoid
-# vectorized accesses to the shared state. Generally, those are "good" but
-# the optimizer pipeline (esp. Attributor) does not fully support vectorized
-# instructions yet and we end up missing out on way more important constant
-# propagation. That said, we will run the vectorizer again after the runtime
-# has been linked into the user program.
-set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false )
-
-# If the user built with the GPU C library enabled we will use that instead.
-if(${LIBOMPTARGET_GPU_LIBC_SUPPORT})
- list(APPEND clang_opt_flags -DOMPTARGET_HAS_LIBC)
-endif()
-
-# Set flags for LLVM Bitcode compilation.
-set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
- ${clang_opt_flags} -nogpulib -nostdlibinc
- -fno-rtti -fno-exceptions -fconvergent-functions
- -Wno-unknown-cuda-version
- -DOMPTARGET_DEVICE_RUNTIME
- -I${include_directory}
- -I${devicertl_base_directory}/../include
- -I${devicertl_base_directory}/../../libc
-)
-
-# first create an object target
-function(compileDeviceRTLLibrary target_name target_triple)
- set(target_bc_flags ${ARGN})
-
- foreach(src ${src_files})
- get_filename_component(infile ${src} ABSOLUTE)
- get_filename_component(outfile ${src} NAME)
- set(outfile "${outfile}-${target_name}.o")
- set(depfile "${outfile}.d")
-
- # Passing an empty CPU to -march= suppressed target specific metadata.
- add_custom_command(OUTPUT ${outfile}
- COMMAND ${CLANG_TOOL}
- ${bc_flags}
- --target=${target_triple}
- ${target_bc_flags}
- -MD -MF ${depfile}
- ${infile} -o ${outfile}
- DEPENDS ${infile}
- DEPFILE ${depfile}
- COMMENT "Building LLVM bitcode ${outfile}"
- VERBATIM
- )
- if(TARGET clang)
- # Add a file-level dependency to ensure that clang is up-to-date.
- # By default, add_custom_command only builds clang if the
- # executable is missing.
- add_custom_command(OUTPUT ${outfile}
- DEPENDS clang
- APPEND
- )
- endif()
- set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
-
- list(APPEND obj_files ${CMAKE_CURRENT_BINARY_DIR}/${outfile})
- endforeach()
- # Trick to combine these into a bitcode file via the linker's LTO pass. This
- # is used to provide the legacy `libomptarget-<name>.bc` files. Hack this
- # through as an executable to get it to use the relocatable link.
- add_executable(libomptarget-${target_name} ${obj_files})
- set_target_properties(libomptarget-${target_name} PROPERTIES
- RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}
- LINKER_LANGUAGE CXX
- BUILD_RPATH ""
- INSTALL_RPATH ""
- RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
- target_compile_options(libomptarget-${target_name} PRIVATE
- "--target=${target_triple}" "-fuse-ld=lld" "-march=" "-mcpu="
- "-Wno-unused-command-line-argument")
- target_link_options(libomptarget-${target_name} PRIVATE "--target=${target_triple}"
- "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm"
- "-fuse-ld=lld" "-march=" "-mcpu=")
- install(TARGETS libomptarget-${target_name}
- PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
- DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}")
-
- add_library(omptarget.${target_name}.all_objs OBJECT IMPORTED)
- set_property(TARGET omptarget.${target_name}.all_objs APPEND PROPERTY IMPORTED_OBJECTS
- ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/libomptarget-${target_name}.bc)
- add_dependencies(omptarget.${target_name}.all_objs libomptarget-${target_name})
-
- # Archive all the object files generated above into a static library
- add_library(omptarget.${target_name} STATIC)
- set_target_properties(omptarget.${target_name} PROPERTIES
- ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/${target_triple}"
- ARCHIVE_OUTPUT_NAME ompdevice
- LINKER_LANGUAGE CXX
- )
- target_link_libraries(omptarget.${target_name} PRIVATE omptarget.${target_name}.all_objs)
- target_link_options(omptarget.${target_name} PRIVATE "--target=${target_triple}"
- "-Wno-unused-command-line-argument" "-r" "-nostdlib" "-flto"
- "-Wl,--lto-emit-llvm" "-fuse-ld=lld" "-march=" "-mcpu=")
-
- install(TARGETS omptarget.${target_name}
- ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}")
-
- if (CMAKE_EXPORT_COMPILE_COMMANDS)
- set(ide_target_name omptarget-ide-${target_name})
- add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
- target_compile_options(${ide_target_name} PRIVATE
- -fvisibility=hidden --target=${target_triple}
- -nogpulib -nostdlibinc -Wno-unknown-cuda-version
- )
- target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
- target_include_directories(${ide_target_name} PRIVATE
- ${include_directory}
- ${devicertl_base_directory}/../../libc
- ${devicertl_base_directory}/../include
- )
- install(TARGETS ${ide_target_name} EXCLUDE_FROM_ALL)
- endif()
-endfunction()
-
-if(NOT LLVM_TARGETS_TO_BUILD OR "AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
- compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
-endif()
-
-if(NOT LLVM_TARGETS_TO_BUILD OR "NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
- compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63)
-endif()
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 08a2e25..051882d 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -39,12 +39,28 @@ using namespace llvm::omp::target;
using namespace llvm::omp::target::plugin;
using namespace error;
+struct ol_platform_impl_t {
+ ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin,
+ ol_platform_backend_t BackendType)
+ : Plugin(std::move(Plugin)), BackendType(BackendType) {}
+ std::unique_ptr<GenericPluginTy> Plugin;
+ llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices;
+ ol_platform_backend_t BackendType;
+
+ /// Complete all pending work for this platform and perform any needed
+ /// cleanup.
+ ///
+ /// After calling this function, no liboffload functions should be called with
+ /// this platform handle.
+ llvm::Error destroy();
+};
+
// Handle type definitions. Ideally these would be 1:1 with the plugins, but
// we add some additional data here for now to avoid churn in the plugin
// interface.
struct ol_device_impl_t {
ol_device_impl_t(int DeviceNum, GenericDeviceTy *Device,
- ol_platform_handle_t Platform, InfoTreeNode &&DevInfo)
+ ol_platform_impl_t &Platform, InfoTreeNode &&DevInfo)
: DeviceNum(DeviceNum), Device(Device), Platform(Platform),
Info(std::forward<InfoTreeNode>(DevInfo)) {}
@@ -55,7 +71,7 @@ struct ol_device_impl_t {
int DeviceNum;
GenericDeviceTy *Device;
- ol_platform_handle_t Platform;
+ ol_platform_impl_t &Platform;
InfoTreeNode Info;
llvm::SmallVector<__tgt_async_info *> OutstandingQueues;
@@ -102,31 +118,17 @@ struct ol_device_impl_t {
}
};
-struct ol_platform_impl_t {
- ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin,
- ol_platform_backend_t BackendType)
- : Plugin(std::move(Plugin)), BackendType(BackendType) {}
- std::unique_ptr<GenericPluginTy> Plugin;
- llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices;
- ol_platform_backend_t BackendType;
-
- /// Complete all pending work for this platform and perform any needed
- /// cleanup.
- ///
- /// After calling this function, no liboffload functions should be called with
- /// this platform handle.
- llvm::Error destroy() {
- llvm::Error Result = Plugin::success();
- for (auto &D : Devices)
- if (auto Err = D->destroy())
- Result = llvm::joinErrors(std::move(Result), std::move(Err));
+llvm::Error ol_platform_impl_t::destroy() {
+ llvm::Error Result = Plugin::success();
+ for (auto &D : Devices)
+ if (auto Err = D->destroy())
+ Result = llvm::joinErrors(std::move(Result), std::move(Err));
- if (auto Res = Plugin->deinit())
- Result = llvm::joinErrors(std::move(Result), std::move(Res));
+ if (auto Res = Plugin->deinit())
+ Result = llvm::joinErrors(std::move(Result), std::move(Res));
- return Result;
- }
-};
+ return Result;
+}
struct ol_queue_impl_t {
ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device)
@@ -206,12 +208,12 @@ struct OffloadContext {
// Partitioned list of memory base addresses. Each element in this list is a
// key in AllocInfoMap
llvm::SmallVector<void *> AllocBases{};
- SmallVector<ol_platform_impl_t, 4> Platforms{};
+ SmallVector<std::unique_ptr<ol_platform_impl_t>, 4> Platforms{};
size_t RefCount;
ol_device_handle_t HostDevice() {
// The host platform is always inserted last
- return Platforms.back().Devices[0].get();
+ return Platforms.back()->Devices[0].get();
}
static OffloadContext &get() {
@@ -251,35 +253,34 @@ Error initPlugins(OffloadContext &Context) {
#define PLUGIN_TARGET(Name) \
do { \
if (StringRef(#Name) != "host") \
- Context.Platforms.emplace_back(ol_platform_impl_t{ \
+ Context.Platforms.emplace_back(std::make_unique<ol_platform_impl_t>( \
std::unique_ptr<GenericPluginTy>(createPlugin_##Name()), \
- pluginNameToBackend(#Name)}); \
+ pluginNameToBackend(#Name))); \
} while (false);
#include "Shared/Targets.def"
// Preemptively initialize all devices in the plugin
for (auto &Platform : Context.Platforms) {
- auto Err = Platform.Plugin->init();
+ auto Err = Platform->Plugin->init();
[[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
- for (auto DevNum = 0; DevNum < Platform.Plugin->number_of_devices();
+ for (auto DevNum = 0; DevNum < Platform->Plugin->number_of_devices();
DevNum++) {
- if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
- auto Device = &Platform.Plugin->getDevice(DevNum);
+ if (Platform->Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
+ auto Device = &Platform->Plugin->getDevice(DevNum);
auto Info = Device->obtainInfoImpl();
if (auto Err = Info.takeError())
return Err;
- Platform.Devices.emplace_back(std::make_unique<ol_device_impl_t>(
- DevNum, Device, &Platform, std::move(*Info)));
+ Platform->Devices.emplace_back(std::make_unique<ol_device_impl_t>(
+ DevNum, Device, *Platform, std::move(*Info)));
}
}
}
// Add the special host device
auto &HostPlatform = Context.Platforms.emplace_back(
- ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST});
- HostPlatform.Devices.emplace_back(
- std::make_unique<ol_device_impl_t>(-1, nullptr, nullptr, InfoTreeNode{}));
- Context.HostDevice()->Platform = &HostPlatform;
+ std::make_unique<ol_platform_impl_t>(nullptr, OL_PLATFORM_BACKEND_HOST));
+ HostPlatform->Devices.emplace_back(std::make_unique<ol_device_impl_t>(
+ -1, nullptr, *HostPlatform, InfoTreeNode{}));
Context.TracingEnabled = std::getenv("OFFLOAD_TRACE");
Context.ValidationEnabled = !std::getenv("OFFLOAD_DISABLE_VALIDATION");
@@ -316,10 +317,10 @@ Error olShutDown_impl() {
for (auto &P : OldContext->Platforms) {
// Host plugin is nullptr and has no deinit
- if (!P.Plugin || !P.Plugin->is_initialized())
+ if (!P->Plugin || !P->Plugin->is_initialized())
continue;
- if (auto Res = P.destroy())
+ if (auto Res = P->destroy())
Result = llvm::joinErrors(std::move(Result), std::move(Res));
}
@@ -384,7 +385,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
// These are not implemented by the plugin interface
switch (PropName) {
case OL_DEVICE_INFO_PLATFORM:
- return Info.write<void *>(Device->Platform);
+ return Info.write<void *>(&Device->Platform);
case OL_DEVICE_INFO_TYPE:
return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_GPU);
@@ -517,7 +518,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
switch (PropName) {
case OL_DEVICE_INFO_PLATFORM:
- return Info.write<void *>(Device->Platform);
+ return Info.write<void *>(&Device->Platform);
case OL_DEVICE_INFO_TYPE:
return Info.write<ol_device_type_t>(OL_DEVICE_TYPE_HOST);
case OL_DEVICE_INFO_NAME:
@@ -595,7 +596,7 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device,
Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) {
for (auto &Platform : OffloadContext::get().Platforms) {
- for (auto &Device : Platform.Devices) {
+ for (auto &Device : Platform->Devices) {
if (!Callback(Device.get(), UserData)) {
break;
}
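Storing platforms as `std::unique_ptr` rather than by value keeps each `ol_platform_impl_t` at a stable address, which is what lets devices hold a `Platform` reference across later growth of the platform list. A small sketch of the pitfall this avoids, with illustrative types rather than the liboffload API:

```cpp
// Sketch: by-value vector elements move on reallocation, so back-references
// into them dangle; unique_ptr elements keep their pointees at fixed
// addresses while only the owning pointers move.
#include <cassert>
#include <memory>
#include <vector>

struct Platform { int id; };

int main() {
  std::vector<Platform> byValue;
  byValue.reserve(1);
  byValue.push_back({0});
  Platform *first = &byValue[0];
  byValue.push_back({1}); // growth likely relocates element 0
  (void)first;            // 'first' may now dangle -- unsafe to use

  std::vector<std::unique_ptr<Platform>> byPtr;
  byPtr.push_back(std::make_unique<Platform>(Platform{0}));
  Platform &stable = *byPtr[0];
  byPtr.push_back(std::make_unique<Platform>(Platform{1}));
  assert(stable.id == 0); // still valid after growth
}
```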
diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 39286d4..a1950cb 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -330,6 +330,54 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
return Rc;
}
+/// Returns a buffer of the requested \p Size, to be used as the source for
+/// `submitData`.
+///
+/// For small buffers (`Size <= sizeof(void*)`), uses \p AsyncInfo's
+/// getVoidPtrLocation().
+/// For larger buffers, creates a dynamic buffer which will eventually be
+/// deleted by \p AsyncInfo's post-processing callback.
+static char *getOrCreateSourceBufferForSubmitData(AsyncInfoTy &AsyncInfo,
+ int64_t Size) {
+ constexpr int64_t VoidPtrSize = sizeof(void *);
+
+ if (Size <= VoidPtrSize) {
+ void *&BufferElement = AsyncInfo.getVoidPtrLocation();
+ return reinterpret_cast<char *>(&BufferElement);
+ }
+
+ // Create a dynamic buffer for larger data and schedule its deletion.
+ char *DataBuffer = new char[Size];
+ AsyncInfo.addPostProcessingFunction([DataBuffer]() {
+ delete[] DataBuffer;
+ return OFFLOAD_SUCCESS;
+ });
+ return DataBuffer;
+}
+
+/// Calculates the target pointee base by applying the host
+/// pointee begin/base delta to the target pointee begin.
+///
+/// ```
+/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)
+/// ```
+static void *calculateTargetPointeeBase(void *HstPteeBase, void *HstPteeBegin,
+ void *TgtPteeBegin) {
+ uint64_t Delta = reinterpret_cast<uint64_t>(HstPteeBegin) -
+ reinterpret_cast<uint64_t>(HstPteeBase);
+ void *TgtPteeBase = reinterpret_cast<void *>(
+ reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta);
+
+ DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD
+ ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n",
+ DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta);
+ DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD
+ "\n",
+ DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin));
+
+ return TgtPteeBase;
+}
+
/// Utility function to perform a pointer attachment operation.
///
/// For something like:
@@ -399,16 +447,8 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
constexpr int64_t VoidPtrSize = sizeof(void *);
assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small");
- uint64_t Delta = reinterpret_cast<uint64_t>(HstPteeBegin) -
- reinterpret_cast<uint64_t>(HstPteeBase);
- void *TgtPteeBase = reinterpret_cast<void *>(
- reinterpret_cast<uint64_t>(TgtPteeBegin) - Delta);
- DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD
- ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n",
- DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta);
- DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD
- "\n",
- DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin));
+ void *TgtPteeBase =
+ calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, TgtPteeBegin);
// Add shadow pointer tracking
if (!PtrTPR.getEntry()->addShadowPointer(
@@ -435,48 +475,32 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
return OFFLOAD_SUCCESS;
};
- bool IsPtrAFortranDescriptor = HstPtrSize > VoidPtrSize;
- if (!IsPtrAFortranDescriptor) {
- // For "regular" pointers, we can use the VoidPtrLocation from AsyncInfo as
- // the buffer space for the submission.
- void *&BufferElement = AsyncInfo.getVoidPtrLocation();
- BufferElement = TgtPteeBase;
-
- // Submit the updated pointer value to device
- return HandleSubmitResult(Device.submitData(
- TgtPtrAddr, &BufferElement, VoidPtrSize, AsyncInfo, PtrTPR.getEntry()));
+ // Get a buffer to be used as the source for data submission.
+ char *SrcBuffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize);
+
+ // The pointee's address should occupy the first VoidPtrSize bytes
+ // irrespective of HstPtrSize.
+ std::memcpy(SrcBuffer, &TgtPteeBase, VoidPtrSize);
+
+ // For larger "pointers" (e.g., Fortran descriptors), copy remaining
+ // descriptor fields from the host descriptor into the buffer.
+ if (HstPtrSize > VoidPtrSize) {
+ uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
+ void *HstDescriptorFieldsAddr =
+ reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize;
+ std::memcpy(SrcBuffer + VoidPtrSize, HstDescriptorFieldsAddr,
+ HstDescriptorFieldsSize);
+
+ DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD
+ ") (pointer + %" PRId64 " additional bytes from host descriptor " DPxMOD
+ ")\n",
+ HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize,
+ DPxPTR(HstDescriptorFieldsAddr));
}
- // For larger "pointers" (like Fortran's descriptors), we create a dynamic
- // buffer, which will be eventually destroyed by AsyncInfo's post-processing
- // callback.
- char *DataBuffer = new char[HstPtrSize];
-
- // For such descriptors, to the first VoidPtrSize bytes, we store the
- // pointee's device address.
- std::memcpy(DataBuffer, &TgtPteeBase, sizeof(void *));
-
- // And to the remaining bytes, we copy the remaining contents of the host
- // descriptor after the initial VoidPtrSize bytes.
- uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
- void *HstDescriptorFieldsAddr =
- reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize;
- std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr,
- HstDescriptorFieldsSize);
-
- DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD ") (pointer + %" PRId64
- " additional bytes from host descriptor " DPxMOD ")\n",
- HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize,
- DPxPTR(HstDescriptorFieldsAddr));
-
- // Submit the entire buffer to device
- int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize,
+ // Submit the populated source buffer to device.
+ int SubmitResult = Device.submitData(TgtPtrAddr, SrcBuffer, HstPtrSize,
AsyncInfo, PtrTPR.getEntry());
-
- AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int {
- delete[] DataBuffer;
- return OFFLOAD_SUCCESS;
- });
return HandleSubmitResult(SubmitResult);
}
@@ -525,10 +549,17 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
// ATTACH map-types are supposed to be handled after all mapping for the
// construct is done. Defer their processing.
if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) {
- AttachInfo->AttachEntries.emplace_back(
- /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin,
- /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I],
- /*PointeeName=*/HstPtrName);
+ const bool IsCorrespondingPointerInit =
+ (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE);
+ // We don't need to keep track of PRIVATE | ATTACH entries. They
+ // represent corresponding-pointer-initialization, and are handled
+ // similar to firstprivate (PRIVATE | TO) entries by
+ // PrivateArgumentManager.
+ if (!IsCorrespondingPointerInit)
+ AttachInfo->AttachEntries.emplace_back(
+ /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin,
+ /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I],
+ /*PointeeName=*/HstPtrName);
DP("Deferring ATTACH map-type processing for argument %d\n", I);
continue;
@@ -1397,13 +1428,24 @@ class PrivateArgumentManagerTy {
uint32_t Padding;
/// Host pointer name
map_var_info_t HstPtrName = nullptr;
+ /// For corresponding-pointer-initialization: host pointee base address.
+ void *HstPteeBase = nullptr;
+ /// For corresponding-pointer-initialization: host pointee begin address.
+ void *HstPteeBegin = nullptr;
+ /// Whether this argument needs corresponding-pointer-initialization.
+ bool IsCorrespondingPointerInit = false;
FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
uint32_t Alignment, uint32_t Padding,
- map_var_info_t HstPtrName = nullptr)
+ map_var_info_t HstPtrName = nullptr,
+ void *HstPteeBase = nullptr,
+ void *HstPteeBegin = nullptr,
+ bool IsCorrespondingPointerInit = false)
: HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
- Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
+ Size(Size), Padding(Padding), HstPtrName(HstPtrName),
+ HstPteeBase(HstPteeBase), HstPteeBegin(HstPteeBegin),
+ IsCorrespondingPointerInit(IsCorrespondingPointerInit) {}
};
/// A vector of target pointers for all private arguments
@@ -1421,6 +1463,153 @@ class PrivateArgumentManagerTy {
/// A pointer to a \p AsyncInfoTy object
AsyncInfoTy &AsyncInfo;
+ /// \returns the value of the target pointee's base to be used for
+ /// corresponding-pointer-initialization.
+ void *getTargetPointeeBaseForCorrespondingPointerInitialization(
+ void *HstPteeBase, void *HstPteeBegin) {
+ // See if the pointee's begin address has corresponding storage on device.
+ void *TgtPteeBegin = [&]() -> void * {
+ if (!HstPteeBegin) {
+ DP("Corresponding-pointer-initialization: pointee begin address is "
+ "null\n");
+ return nullptr;
+ }
+
+ return Device.getMappingInfo()
+ .getTgtPtrBegin(HstPteeBegin, /*Size=*/0, /*UpdateRefCount=*/false,
+ /*UseHoldRefCount=*/false)
+ .TargetPointer;
+ }();
+
+ // If it does, we calculate the target pointee base using it and return it.
+ // Otherwise, we retain the host pointee's base as the target pointee base
+ // of the initialized pointer. It's the user's responsibility to ensure
+ // that if a lookup fails, the host pointee is accessible on the device.
+ return TgtPteeBegin ? calculateTargetPointeeBase(HstPteeBase, HstPteeBegin,
+ TgtPteeBegin)
+ : HstPteeBase;
+ }
+
+ /// Initialize the source buffer for corresponding-pointer-initialization.
+ ///
+ /// It computes and stores the target pointee base address (or the host
+ /// pointee's base address, if lookup of target pointee fails) to the first
+ /// `sizeof(void*)` bytes of \p Buffer, and for larger pointers
+ /// (Fortran descriptors), the remaining fields of the host descriptor
+ /// \p HstPtr after those `sizeof(void*)` bytes.
+ ///
+ /// Corresponding-pointer-initialization represents the initialization of the
+ /// private version of a base-pointer/referring-pointer on a target construct.
+ ///
+ /// For example, for the following test:
+ /// ```cpp
+ /// int x[10];
+ /// int *px = &x[0];
+ /// ...
+ /// #pragma omp target data map(tofrom:px)
+ /// {
+ /// int **ppx = omp_get_mapped_ptr(&px, omp_get_default_device());
+ /// #pragma omp target map(tofrom:px[1]) is_device_ptr(ppx)
+ /// {
+ /// foo(px, ppx);
+ /// }
+ /// }
+ /// ```
+ /// The following shows a possible way to implement the mapping of `px`,
+ /// which is pre-determined firstprivate and should get initialized
+ /// via corresponding-pointer-initialization:
+ ///
+ /// (A) Possible way to implement the above with PRIVATE | ATTACH:
+ /// ```llvm
+ /// ; maps for px:
+ /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM // (1)
+ /// ; &px, &px[1], sizeof(px), ATTACH // (2)
+ /// ; &px, &px[1], sizeof(px), PRIVATE | ATTACH | PARAM // (3)
+ /// call... @__omp_outlined...(ptr %px, ptr %ppx)
+ /// define ... @__omp_outlined(ptr %px, ptr %ppx) {...
+ /// foo(%px, %ppx)
+ /// ...}
+ /// ```
+ /// `(1)` maps the pointee `px[1]`.
+ /// `(2)` attaches it to the mapped version of `px`. It can be controlled by
+ /// the user based on the `attach(auto/always/never)` map-type modifier.
+ /// `(3)` privatizes and initializes the private pointer `px`, and passes it
+ /// into the kernel as the argument `%px`. Can be skipped if `px` is not
+ /// referenced in the target construct.
+ ///
+ /// For plain pointers, this method is not much more beneficial than just
+ /// doing the initialization in the body of the kernel, like:
+ /// (B) Possible way to implement the above without PRIVATE | ATTACH:
+ /// ```llvm
+ /// ; maps for px:
+ /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM | PARAM // (4)
+ /// ; &px, &px[1], sizeof(px), ATTACH // (5)
+ /// call... @__omp_outlined...(ptr %px0, ptr %ppx)
+ /// define ... __omp_outlined...(ptr %px0, ptr %ppx) {
+ /// %px = alloca ptr;
+ /// store ptr %px0, ptr %px
+ /// foo(%px, %ppx)
+ /// }
+ /// ```
+ ///
+ /// (B) is not so convenient for Fortran descriptors, because in
+ /// addition to the lookup, the remaining fields of the descriptor have
+ /// to be passed into the kernel to initialize the private copy, which
+ /// makes (A) a cleaner option for them. e.g.
+ /// ```f90
+ /// integer, pointer :: p(:)
+ /// !$omp target map(p(1))
+ /// ```
+ ///
+ /// (C) Possible mapping for the above Fortran test using PRIVATE | ATTACH:
+ /// ```llvm
+ /// ; maps for p:
+ /// ; &p(1), &p(1), sizeof(p(1)), TO | FROM
+ /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), ATTACH
+ /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), PRIVATE | ATTACH | PARAM
+ /// call... @__omp_outlined...(ptr %ref_ptr_of_p)
+ /// ```
+ void initBufferForCorrespondingPointerInitialization(char *Buffer,
+ void *HstPtr,
+ int64_t HstPtrSize,
+ void *HstPteeBase,
+ void *HstPteeBegin) {
+ constexpr int64_t VoidPtrSize = sizeof(void *);
+ assert(HstPtrSize >= VoidPtrSize &&
+ "corresponding-pointer-initialization: pointer size is too small");
+
+ void *TgtPteeBase =
+ getTargetPointeeBaseForCorrespondingPointerInitialization(HstPteeBase,
+ HstPteeBegin);
+
+ // Store the target pointee base address to the first VoidPtrSize bytes
+ DP("Initializing corresponding-pointer-initialization source buffer "
+ "for " DPxMOD ", with pointee base " DPxMOD "\n",
+ DPxPTR(HstPtr), DPxPTR(TgtPteeBase));
+ std::memcpy(Buffer, &TgtPteeBase, VoidPtrSize);
+ if (HstPtrSize <= VoidPtrSize)
+ return;
+
+ // For Fortran descriptors, copy the remaining descriptor fields from host
+ uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
+ void *HstDescriptorFieldsAddr = static_cast<char *>(HstPtr) + VoidPtrSize;
+ DP("Copying %" PRId64
+ " bytes of descriptor fields into corresponding-pointer-initialization "
+ "buffer at offset %" PRId64 ", from " DPxMOD "\n",
+ HstDescriptorFieldsSize, VoidPtrSize, DPxPTR(HstDescriptorFieldsAddr));
+ std::memcpy(Buffer + VoidPtrSize, HstDescriptorFieldsAddr,
+ HstDescriptorFieldsSize);
+ }
+
+ /// Helper function to create and initialize a buffer to be used as the source
+ /// for corresponding-pointer-initialization.
+ void *createAndInitSourceBufferForCorrespondingPointerInitialization(
+ void *HstPtr, int64_t HstPtrSize, void *HstPteeBase, void *HstPteeBegin) {
+ char *Buffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize);
+ initBufferForCorrespondingPointerInitialization(Buffer, HstPtr, HstPtrSize,
+ HstPteeBase, HstPteeBegin);
+ return Buffer;
+ }
+
// TODO: What would be the best value here? Should we make it configurable?
// If the size is larger than this threshold, we will allocate and transfer it
// immediately instead of packing it.
@@ -1435,7 +1624,9 @@ public:
int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
map_var_info_t HstPtrName = nullptr,
- const bool AllocImmediately = false) {
+ const bool AllocImmediately = false, void *HstPteeBase = nullptr,
+ void *HstPteeBegin = nullptr,
+ bool IsCorrespondingPointerInit = false) {
// If the argument is not first-private, or its size is greater than a
// predefined threshold, we will allocate memory and issue the transfer
// immediately.
@@ -1458,9 +1649,19 @@ public:
// If first-private, copy data from host
if (IsFirstPrivate) {
DP("Submitting firstprivate data to the device.\n");
- int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
+
+ // The source value used for corresponding-pointer-initialization
+ // differs from the one used for regular firstprivates.
+ void *DataSource =
+ IsCorrespondingPointerInit
+ ? createAndInitSourceBufferForCorrespondingPointerInitialization(
+ HstPtr, ArgSize, HstPteeBase, HstPteeBegin)
+ : HstPtr;
+ int Ret = Device.submitData(TgtPtr, DataSource, ArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
- DP("Copying data to device failed, failed.\n");
+ DP("Copying %s data to device failed.\n",
+ IsCorrespondingPointerInit ? "corresponding-pointer-initialization"
+ : "firstprivate");
return OFFLOAD_FAIL;
}
}
@@ -1506,8 +1707,10 @@ public:
}
}
- FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
- StartAlignment, Padding, HstPtrName);
+ FirstPrivateArgInfo.emplace_back(
+ TgtArgsIndex, HstPtr, ArgSize, StartAlignment, Padding, HstPtrName,
+ HstPteeBase, HstPteeBegin, IsCorrespondingPointerInit);
+
FirstPrivateArgSize += Padding + ArgSize;
}
@@ -1526,7 +1729,13 @@ public:
for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
// First pad the pointer as we (have to) pad it on the device too.
Itr = std::next(Itr, Info.Padding);
- std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
+
+ if (Info.IsCorrespondingPointerInit)
+ initBufferForCorrespondingPointerInitialization(
+ &*Itr, Info.HstPtrBegin, Info.Size, Info.HstPteeBase,
+ Info.HstPteeBegin);
+ else
+ std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
Itr = std::next(Itr, Info.Size);
}
// Allocate target memory
@@ -1682,8 +1891,40 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
TgtPtrBegin = HstPtrBase;
TgtBaseOffset = 0;
} else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
+ // For cases like:
+ // ```
+ // int *p = ...;
+ // #pragma omp target map(p[0:10])
+ // ```
+ // `p` is predetermined firstprivate on the target construct, and the
+ // method to determine the initial value of the private copy on the
+ // device is called "corresponding-pointer-initialization".
+ //
+ // Such firstprivate pointers that need
+ // corresponding-pointer-initialization are represented using the
+ // `PRIVATE | ATTACH` map-types, in contrast to regular firstprivate
+ // entries, which use `PRIVATE | TO`. The structure of these
+ // `PRIVATE | ATTACH` entries is the same as the non-private
+ // `ATTACH` entries used to represent pointer-attachments, i.e.:
+ // ```
+ // &hst_ptr_base/begin, &hst_ptee_begin, sizeof(hst_ptr)
+ // ```
+ const bool IsAttach = (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH);
+ void *HstPteeBase = nullptr;
+ void *HstPteeBegin = nullptr;
+ if (IsAttach) {
+ // For corresponding-pointer-initialization, Args[I] is HstPteeBegin,
+ // and ArgBases[I] serves as both HstPtrBase and HstPtrBegin.
+ HstPteeBase = *reinterpret_cast<void **>(HstPtrBase);
+ HstPteeBegin = Args[I];
+ HstPtrBegin = ArgBases[I];
+ }
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
- const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
+ // Corresponding-pointer-initialization is a special case of firstprivate,
+ // since it also involves initializing the private pointer.
+ const bool IsFirstPrivate =
+ (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) || IsAttach;
+
// If there is a next argument and it depends on the current one, we need
// to allocate the private memory immediately. If this is not the case,
// then the argument can be marked for optimization and packed with the
@@ -1692,9 +1933,11 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
(I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
Ret = PrivateArgumentManager.addArg(
HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
- TgtArgs.size(), HstPtrName, AllocImmediately);
+ /*TgtArgsIndex=*/TgtArgs.size(), HstPtrName, AllocImmediately,
+ HstPteeBase, HstPteeBegin, /*IsCorrespondingPointerInit=*/IsAttach);
if (Ret != OFFLOAD_SUCCESS) {
- REPORT("Failed to process %sprivate argument " DPxMOD "\n",
+ REPORT("Failed to process %s%sprivate argument " DPxMOD "\n",
+ IsAttach ? "corresponding-pointer-initialization " : "",
(IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
return OFFLOAD_FAIL;
}
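The omptarget.cpp refactor above centers on two reusable steps: computing the target pointee base from the host base/begin delta, and packing a submit buffer whose first `sizeof(void *)` bytes hold the pointee address while any trailing Fortran-descriptor fields are copied verbatim from the host. A standalone sketch of that arithmetic and packing, with an illustrative `Descriptor` type rather than the libomptarget API:

```cpp
// Sketch of the two factored-out helpers: the pointee-base delta
// computation and the descriptor-sized submit-buffer packing.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

static void *calcTgtPteeBase(void *hstBase, void *hstBegin, void *tgtBegin) {
  // TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)
  uint64_t delta = reinterpret_cast<uint64_t>(hstBegin) -
                   reinterpret_cast<uint64_t>(hstBase);
  return reinterpret_cast<void *>(reinterpret_cast<uint64_t>(tgtBegin) -
                                  delta);
}

int main() {
  char hstArray[64], tgtArray[64];
  void *hstBase = &hstArray[0], *hstBegin = &hstArray[8];
  void *tgtBegin = &tgtArray[8];
  void *tgtBase = calcTgtPteeBase(hstBase, hstBegin, tgtBegin);
  assert(tgtBase == &tgtArray[0]); // same 8-byte offset on the target side

  // Packing: first sizeof(void*) bytes hold the target pointee base, the
  // rest copies the host descriptor's trailing fields verbatim.
  struct Descriptor { void *base; uint64_t extent; } hstDesc{hstBase, 10};
  std::vector<char> buf(sizeof(Descriptor));
  std::memcpy(buf.data(), &tgtBase, sizeof(void *));
  std::memcpy(buf.data() + sizeof(void *),
              reinterpret_cast<char *>(&hstDesc) + sizeof(void *),
              sizeof(Descriptor) - sizeof(void *));
  Descriptor packed;
  std::memcpy(&packed, buf.data(), sizeof(Descriptor));
  assert(packed.base == tgtBase && packed.extent == 10);
}
```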
diff --git a/offload/test/mapping/lambda_by_value.cpp b/offload/test/mapping/lambda_by_value.cpp
index 5516dedd..4c0278d 100644
--- a/offload/test/mapping/lambda_by_value.cpp
+++ b/offload/test/mapping/lambda_by_value.cpp
@@ -1,4 +1,5 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileopt-generic -fno-exceptions
+// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
#include <stdint.h>
#include <stdio.h>
diff --git a/offload/test/mapping/map_back_race.cpp b/offload/test/mapping/map_back_race.cpp
index 8a988d3..49bbe87 100644
--- a/offload/test/mapping/map_back_race.cpp
+++ b/offload/test/mapping/map_back_race.cpp
@@ -2,6 +2,9 @@
// Taken from https://github.com/llvm/llvm-project/issues/54216
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
+
#include <algorithm>
#include <cstdlib>
#include <iostream>
diff --git a/offload/test/mapping/map_both_pointer_pointee.c b/offload/test/mapping/map_both_pointer_pointee.c
index 7be1ba4..1934b70 100644
--- a/offload/test/mapping/map_both_pointer_pointee.c
+++ b/offload/test/mapping/map_both_pointer_pointee.c
@@ -1,11 +1,10 @@
-// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-x86_64-unknown-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// RUN: %libomptarget-compile-run-and-check-generic
// REQUIRES: unified_shared_memory
// UNSUPPORTED: amdgcn-amd-amdhsa
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: nvidiagpu
#pragma omp declare target
int *ptr1;
diff --git a/offload/test/mapping/map_ptr_and_star_local.c b/offload/test/mapping/map_ptr_and_star_local.c
index cc826b3..97fa7cd 100644
--- a/offload/test/mapping/map_ptr_and_star_local.c
+++ b/offload/test/mapping/map_ptr_and_star_local.c
@@ -1,6 +1,9 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compile-run-and-check-generic
// REQUIRES: libc
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: gpu
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/mapping/map_structptr_and_member_global.c b/offload/test/mapping/map_structptr_and_member_global.c
index 960eea4..f855e87 100644
--- a/offload/test/mapping/map_structptr_and_member_global.c
+++ b/offload/test/mapping/map_structptr_and_member_global.c
@@ -1,6 +1,9 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compile-run-and-check-generic
// REQUIRES: libc
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: gpu
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/mapping/map_structptr_and_member_local.c b/offload/test/mapping/map_structptr_and_member_local.c
index bd75940..bd9e2a8 100644
--- a/offload/test/mapping/map_structptr_and_member_local.c
+++ b/offload/test/mapping/map_structptr_and_member_local.c
@@ -1,6 +1,9 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compile-run-and-check-generic
// REQUIRES: libc
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: gpu
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index 1f84a0e..b2e1edf 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -5,10 +5,10 @@
// RUN: %t | %fcheck-generic
// clang-format on
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
+// REQUIRES: gpu
+//
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// XFAIL: gpu
#include <stdio.h>
diff --git a/offload/test/offloading/bug51781.c b/offload/test/offloading/bug51781.c
index 2f30b03..ff7fa51 100644
--- a/offload/test/offloading/bug51781.c
+++ b/offload/test/offloading/bug51781.c
@@ -16,6 +16,7 @@
// the generic state machine.
//
// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt \
+// RUN: -Xoffload-linker -mllvm=-openmp-opt-disable-spmdization \
// RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
@@ -24,7 +25,9 @@
// Repeat with reduction clause, which has managed to break the custom state
// machine in the past.
//
-// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt -DADD_REDUCTION \
+// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt \
+// RUN: -DADD_REDUCTION \
+// RUN: -Xoffload-linker -mllvm=-openmp-opt-disable-spmdization \
// RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
// RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
// RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
diff --git a/offload/test/offloading/fortran/declare-target-automap.f90 b/offload/test/offloading/fortran/declare-target-automap.f90
index b9c2d34..b44c0b2 100644
--- a/offload/test/offloading/fortran/declare-target-automap.f90
+++ b/offload/test/offloading/fortran/declare-target-automap.f90
@@ -1,6 +1,9 @@
!Offloading test for AUTOMAP modifier in declare target enter
! REQUIRES: flang, amdgpu
+! FIXME: https://github.com/llvm/llvm-project/issues/161265
+! XFAIL: amdgpu
+
! RUN: %libomptarget-compile-fortran-run-and-check-generic
program automap_program
use iso_c_binding, only: c_loc
diff --git a/offload/test/offloading/fortran/target-no-loop.f90 b/offload/test/offloading/fortran/target-no-loop.f90
index 8e40e20..3c88b00 100644
--- a/offload/test/offloading/fortran/target-no-loop.f90
+++ b/offload/test/offloading/fortran/target-no-loop.f90
@@ -1,4 +1,5 @@
! REQUIRES: flang
+! REQUIRES: gpu
! RUN: %libomptarget-compile-fortran-generic -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription
! RUN: env LIBOMPTARGET_INFO=16 OMP_NUM_TEAMS=16 OMP_TEAMS_THREAD_LIMIT=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
diff --git a/offload/test/offloading/interop.c b/offload/test/offloading/interop.c
index 26287e3..d9fa2ef 100644
--- a/offload/test/offloading/interop.c
+++ b/offload/test/offloading/interop.c
@@ -1,5 +1,6 @@
// RUN: %libomptarget-compile-run-and-check-generic
-// REQUIRES: nvptx64-nvidia-cuda
+
+// XFAIL: *
#include <assert.h>
#include <omp.h>
diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_1.c b/offload/test/offloading/single_threaded_for_barrier_hang_1.c
index 8ee6b51..a007521 100644
--- a/offload/test/offloading/single_threaded_for_barrier_hang_1.c
+++ b/offload/test/offloading/single_threaded_for_barrier_hang_1.c
@@ -1,6 +1,9 @@
// RUN: %libomptarget-compile-run-and-check-generic
// RUN: %libomptarget-compileopt-run-and-check-generic
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
+
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_2.c b/offload/test/offloading/single_threaded_for_barrier_hang_2.c
index a98abd6..cabd2ed 100644
--- a/offload/test/offloading/single_threaded_for_barrier_hang_2.c
+++ b/offload/test/offloading/single_threaded_for_barrier_hang_2.c
@@ -1,6 +1,7 @@
// RUN: %libomptarget-compile-run-and-check-generic
-// FIXME: This fails with optimization enabled and prints b: 0
-// FIXME: RUN: %libomptarget-compileopt-run-and-check-generic
+
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: gpu
#include <omp.h>
#include <stdio.h>
diff --git a/offload/test/offloading/spmdization.c b/offload/test/offloading/spmdization.c
index 7f3f47d..48627cd 100644
--- a/offload/test/offloading/spmdization.c
+++ b/offload/test/offloading/spmdization.c
@@ -2,7 +2,8 @@
// RUN: %libomptarget-compileopt-generic
// RUN: env LIBOMPTARGET_INFO=16 \
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,SPMD
-// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization
+// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization \
+// RUN: -Xoffload-linker -mllvm=--openmp-opt-disable-spmdization
// RUN: env LIBOMPTARGET_INFO=16 \
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,GENERIC
// clang-format on
diff --git a/offload/test/sanitizer/ptr_outside_alloc_1.c b/offload/test/sanitizer/ptr_outside_alloc_1.c
index bdd0283..b30ce12e 100644
--- a/offload/test/sanitizer/ptr_outside_alloc_1.c
+++ b/offload/test/sanitizer/ptr_outside_alloc_1.c
@@ -5,12 +5,10 @@
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
// clang-format on
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
#include <omp.h>
diff --git a/offload/test/sanitizer/ptr_outside_alloc_2.c b/offload/test/sanitizer/ptr_outside_alloc_2.c
index 6a67962..3bb8bda 100644
--- a/offload/test/sanitizer/ptr_outside_alloc_2.c
+++ b/offload/test/sanitizer/ptr_outside_alloc_2.c
@@ -3,12 +3,10 @@
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
// clang-format on
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
#include <omp.h>
diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c
index c4783c5c..acc1de3 100644
--- a/offload/test/sanitizer/use_after_free_1.c
+++ b/offload/test/sanitizer/use_after_free_1.c
@@ -5,12 +5,10 @@
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
// clang-format on
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
#include <omp.h>
diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c
index 1c1e097..3d70fb7 100644
--- a/offload/test/sanitizer/use_after_free_2.c
+++ b/offload/test/sanitizer/use_after_free_2.c
@@ -3,12 +3,10 @@
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
// clang-format on
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
// If offload memory pooling is enabled for a large allocation, reuse error is
// not detected. UNSUPPORTED: large_allocation_memory_pool
diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 67a6e07..9b58d67 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -137,7 +137,7 @@ ol_result_t printDeviceValue(std::ostream &S, ol_device_handle_t Dev,
size_t Size;
OFFLOAD_ERR(olGetDeviceInfoSize(Dev, Info, &Size));
Val.resize(Size);
- OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, sizeof(Val), Val.data()));
+ OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, Size, Val.data()));
doWrite<T, PK>(S, reinterpret_cast<T>(Val.data()));
} else {
T Val;
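
The one-line fix above is worth spelling out: sizeof(Val) yields the size of the container object itself (a handful of pointers), not the Size-byte buffer it owns after resize, so the call was reporting a far smaller capacity than the one just allocated from olGetDeviceInfoSize. A minimal self-contained sketch of the bug and the fix, where get_info is a hypothetical stand-in for olGetDeviceInfo:

    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Hypothetical stand-in for olGetDeviceInfo: fills `buf` with up to
    // `size` bytes of a property value.
    static void get_info(std::size_t size, void *buf) {
      std::memset(buf, 'x', size);
    }

    int main() {
      std::size_t Size = 64;        // length reported by the size query
      std::vector<char> Val(Size);
      // BUG: sizeof(Val) is the size of the vector *object* (typically
      // 24 bytes), not the 64-byte heap buffer it manages.
      get_info(sizeof(Val), Val.data());
      // FIX: pass the queried size, as the patch now does.
      get_info(Size, Val.data());
      std::printf("sizeof(Val)=%zu, Size=%zu\n", sizeof(Val), Size);
    }

Passing the queried Size keeps the write bounded by the buffer length established immediately above it.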
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index ded961a..54cfdfe 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -64,7 +64,7 @@ set_target_properties(libompdevice PROPERTIES
RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc)
# If the user built with the GPU C library enabled we will use that instead.
-if(LIBOMPTARGET_GPU_LIBC_SUPPORT)
+if(TARGET libc)
target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC)
endif()
target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512)
diff --git a/openmp/runtime/test/transform/fuse/foreach.cpp b/openmp/runtime/test/transform/fuse/foreach.cpp
new file mode 100644
index 0000000..176465b
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/foreach.cpp
@@ -0,0 +1,191 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp fuse
+ {
+ for (Reporter a{"C"}; auto &&v : Reporter("A"))
+ printf("v=%d\n", v);
+ for (Reporter aa{"D"}; auto &&vv : Reporter("B"))
+ printf("vv=%d\n", vv);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+// CHECK: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: vv=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: vv=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: vv=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
+
+#endif
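
The CHECK sequence in this new test (end() before the trip-count computation, one "iterator distance" per range, then an "advance / move assign / deref / dtor" group per logical iteration) suggests the fuse lowering precomputes each range's trip count and re-derives the iterator as begin() + f on every fused iteration instead of incrementing a live iterator. A loose illustration of that access pattern over an ordinary random-access range — inferred from the printed trace, not taken from the compiler's actual codegen:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> range{0, 1, 2};
      auto first = range.begin();
      auto last = range.end();
      auto count = last - first;      // "iterator distance: 3"
      for (long f = 0; f < count; ++f) {
        auto it = first + f;          // "iterator advance: 0 += f"
        auto &&v = *it;               // "iterator deref: f"
        std::printf("v=%d\n", v);
      }                               // iterator destroyed each pass
    }

The trace actually shows a move-assign into an iterator declared outside the loop; the fresh local here is close enough for illustration.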
diff --git a/openmp/runtime/test/transform/fuse/intfor.c b/openmp/runtime/test/transform/fuse/intfor.c
new file mode 100644
index 0000000..b8171b4
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/intfor.c
@@ -0,0 +1,50 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp fuse
+ {
+ for (int i = 5; i <= 25; i += 5)
+ printf("i=%d\n", i);
+ for (int j = 10; j < 100; j += 10)
+ printf("j=%d\n", j);
+ for (int k = 10; k > 0; --k)
+ printf("k=%d\n", k);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=5
+// CHECK-NEXT: j=10
+// CHECK-NEXT: k=10
+// CHECK-NEXT: i=10
+// CHECK-NEXT: j=20
+// CHECK-NEXT: k=9
+// CHECK-NEXT: i=15
+// CHECK-NEXT: j=30
+// CHECK-NEXT: k=8
+// CHECK-NEXT: i=20
+// CHECK-NEXT: j=40
+// CHECK-NEXT: k=7
+// CHECK-NEXT: i=25
+// CHECK-NEXT: j=50
+// CHECK-NEXT: k=6
+// CHECK-NEXT: j=60
+// CHECK-NEXT: k=5
+// CHECK-NEXT: j=70
+// CHECK-NEXT: k=4
+// CHECK-NEXT: j=80
+// CHECK-NEXT: k=3
+// CHECK-NEXT: j=90
+// CHECK-NEXT: k=2
+// CHECK-NEXT: k=1
+// CHECK-NEXT: done
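
The expected output above pins down the semantics this test checks for #pragma omp fuse: the fused loop runs max(5, 9, 10) = 10 logical iterations, and each original body executes only while its own trip count is not yet exhausted. A rough de-sugaring that reproduces the same output — a sketch for illustration, not the compiler's actual codegen:

    #include <cstdio>

    int main() {
      std::printf("do\n");
      for (int f = 0; f < 10; ++f) {  // max trip count of the three loops
        if (f < 5)
          std::printf("i=%d\n", 5 + 5 * f);   // 5 iterations: 5..25 by 5
        if (f < 9)
          std::printf("j=%d\n", 10 + 10 * f); // 9 iterations: 10..90 by 10
        if (f < 10)
          std::printf("k=%d\n", 10 - f);      // 10 iterations: 10 down to 1
      }
      std::printf("done\n");
    }

Once the i loop finishes at f == 5, only j and k lines appear, and the last fused iteration prints k alone, exactly as the CHECK-NEXT lines require.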
diff --git a/openmp/runtime/test/transform/fuse/iterfor.cpp b/openmp/runtime/test/transform/fuse/iterfor.cpp
new file mode 100644
index 0000000..552484b
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/iterfor.cpp
@@ -0,0 +1,194 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+ Reporter C("C");
+ Reporter D("D");
+#pragma omp fuse
+ {
+ for (auto it = C.begin(); it != C.end(); ++it)
+ printf("v=%d\n", *it);
+
+ for (auto it = D.begin(); it != D.end(); ++it)
+ printf("vv=%d\n", *it);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK: [C] ctor
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [C] begin()
+// CHECK-NEXT: [C] begin()
+// CHECK-NEXT: [C] end()
+// CHECK-NEXT: [C] iterator distance: 3
+// CHECK-NEXT: [D] begin()
+// CHECK-NEXT: [D] begin()
+// CHECK-NEXT: [D] end()
+// CHECK-NEXT: [D] iterator distance: 3
+// CHECK-NEXT: [C] iterator advance: 0 += 0
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 0
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 0
+// CHECK-NEXT: vv=0
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator advance: 0 += 1
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 1
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 1
+// CHECK-NEXT: vv=1
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator advance: 0 += 2
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 2
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 2
+// CHECK-NEXT: vv=2
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: done
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [C] dtor
diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 0000000..dcbbdf1
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,207 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp fuse
+ {
+ for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+ printf("i=%d v=%d\n", i, v);
+ for (int vv = 0; vv < 3; ++vv)
+ printf("i=%d vv=%d\n", i, vv);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] iterator distance: 3
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=0 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=0 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=0 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=0 vv=2
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=1 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=1 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=1 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=2
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
new file mode 100644
index 0000000..9630fec
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
@@ -0,0 +1,45 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdio>
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp fuse
+ {
+ for (int j = 0; j < 3; ++j)
+ printf("i=%d j=%d\n", i, j);
+ for (int k = 0; k < 3; ++k)
+ printf("i=%d k=%d\n", i, k);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: i=0 k=0
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: i=0 k=1
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: i=0 k=2
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: i=1 k=0
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: i=1 k=1
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: i=1 k=2
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: i=2 k=0
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: i=2 k=1
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: i=2 k=2
+// CHECK-NEXT: done
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 9d02ff9..8d9e803 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2658,6 +2658,22 @@ libc_support_library(
)
libc_support_library(
+ name = "__support_math_exp10m1f",
+ hdrs = ["src/__support/math/exp10m1f.h"],
+ deps = [
+ ":__support_fputil_except_value_utils",
+ ":__support_fputil_fenv_impl",
+ ":__support_fputil_fp_bits",
+ ":__support_fputil_multiply_add",
+ ":__support_fputil_polyeval",
+ ":__support_fputil_rounding_mode",
+ ":__support_macros_optimization",
+ ":__support_math_exp10f_utils",
+ ":errno",
+ ],
+)
+
+libc_support_library(
name = "__support_math_erff",
hdrs = ["src/__support/math/erff.h"],
deps = [
@@ -3613,7 +3629,7 @@ libc_math_function(
libc_math_function(
name = "exp10m1f",
additional_deps = [
- ":__support_math_exp10f_utils",
+ ":__support_math_exp10m1f",
],
)
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 2a0cc30..422c29f 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6091,6 +6091,7 @@ cc_library(
":DialectUtils",
":GPUDialect",
":IR",
+ ":InferIntRangeInterface",
":LLVMDialect",
":NVVMOpsIncGen",
":NVVMRequiresSMTraitsIncGen",
@@ -6295,6 +6296,7 @@ cc_library(
":BytecodeOpInterface",
":GPUDialect",
":IR",
+ ":InferIntRangeInterface",
":LLVMDialect",
":ROCDLOpsIncGen",
":SideEffectInterfaces",
@@ -6685,6 +6687,7 @@ cc_library(
":IR",
":InferTypeOpInterface",
":SMTIncGen",
+ ":SideEffectInterfaces",
":Support",
"//llvm:Support",
],
@@ -11831,6 +11834,7 @@ cc_library(
srcs = glob(["lib/Dialect/Transform/PDLExtension/*.cpp"]),
hdrs = glob(["include/mlir/Dialect/Transform/PDLExtension/*.h"]),
deps = [
+ ":BytecodeOpInterface",
":IR",
":PDLDialect",
":PDLInterpDialect",
@@ -11945,6 +11949,7 @@ cc_library(
srcs = glob(["lib/Dialect/Transform/IRDLExtension/*.cpp"]),
hdrs = glob(["include/mlir/Dialect/Transform/IRDLExtension/*.h"]),
deps = [
+ ":BytecodeOpInterface",
":IR",
":IRDLDialect",
":IRDLInterfacesIncGen",
@@ -11986,7 +11991,9 @@ cc_library(
srcs = glob(["lib/Dialect/Transform/DebugExtension/*.cpp"]),
hdrs = glob(["include/mlir/Dialect/Transform/DebugExtension/*.h"]),
deps = [
+ ":BytecodeOpInterface",
":IR",
+ ":SideEffectInterfaces",
":Support",
":TransformDebugExtensionOpsIncGen",
":TransformDialect",
@@ -12023,6 +12030,7 @@ cc_library(
srcs = glob(["lib/Dialect/Transform/LoopExtension/*.cpp"]),
hdrs = glob(["include/mlir/Dialect/Transform/LoopExtension/*.h"]),
deps = [
+ ":BytecodeOpInterface",
":IR",
":LoopLikeInterface",
":Rewrite",
@@ -13071,6 +13079,7 @@ cc_library(
":MPIOpsIncGen",
":MPITypesIncGen",
":MemRefDialect",
+ ":SideEffectInterfaces",
"//llvm:Support",
],
)