aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.ci/generate_test_report_github.py15
-rw-r--r--.ci/utils.sh3
-rw-r--r--.github/workflows/premerge.yaml49
-rw-r--r--bolt/lib/Rewrite/RewriteInstance.cpp16
-rw-r--r--bolt/test/X86/fragment-alias.s13
-rw-r--r--clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp22
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/list.rst6
-rw-r--r--clang/docs/DebuggingCoroutines.rst431
-rw-r--r--clang/docs/ReleaseNotes.rst1
-rw-r--r--clang/include/clang/AST/HLSLResource.h13
-rw-r--r--clang/include/clang/AST/OpenACCClause.h24
-rw-r--r--clang/include/clang/ASTMatchers/GtestMatchers.h87
-rw-r--r--clang/include/clang/Basic/Attr.td12
-rw-r--r--clang/include/clang/Basic/Builtins.td6
-rw-r--r--clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h6
-rw-r--r--clang/include/clang/CIR/MissingFeatures.h7
-rw-r--r--clang/lib/AST/StmtProfile.cpp11
-rw-r--r--clang/lib/ASTMatchers/CMakeLists.txt1
-rw-r--r--clang/lib/ASTMatchers/GtestMatchers.cpp228
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenBuilder.h14
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenCXX.cpp59
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenCXXABI.h32
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenClass.cpp51
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenDecl.cpp6
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenExpr.cpp8
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp46
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp80
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp5
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenFunction.h2
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp193
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenModule.cpp23
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenModule.h4
-rw-r--r--clang/lib/CIR/CodeGen/CIRGenVTables.cpp35
-rw-r--r--clang/lib/CIR/CodeGen/EHScopeStack.h3
-rw-r--r--clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp103
-rw-r--r--clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp7
-rw-r--r--clang/lib/CodeGen/CGHLSLBuiltins.cpp13
-rw-r--r--clang/lib/CodeGen/CGHLSLRuntime.cpp16
-rw-r--r--clang/lib/Parse/ParseExprCXX.cpp4
-rw-r--r--clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp141
-rw-r--r--clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h7
-rw-r--r--clang/lib/Sema/HLSLExternalSemaSource.cpp3
-rw-r--r--clang/lib/Sema/SemaHLSL.cpp112
-rw-r--r--clang/lib/Sema/SemaOpenACC.cpp2
-rw-r--r--clang/lib/Serialization/ASTReader.cpp17
-rw-r--r--clang/lib/Serialization/ASTWriter.cpp11
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp2
-rw-r--r--clang/test/AST/ByteCode/libcxx/deref-to-array.cpp2
-rw-r--r--clang/test/AST/HLSL/StructuredBuffers-AST.hlsl184
-rw-r--r--clang/test/AST/HLSL/vk_binding_attr.hlsl10
-rw-r--r--clang/test/Analysis/Checkers/WebKit/unretained-members.mm7
-rw-r--r--clang/test/CIR/CodeGen/dtors.cpp81
-rw-r--r--clang/test/CIR/CodeGen/dynamic-cast.cpp28
-rw-r--r--clang/test/CIR/CodeGen/global-init.cpp108
-rw-r--r--clang/test/CIR/CodeGen/record-zero-init-padding.c90
-rw-r--r--clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl62
-rw-r--r--clang/test/CodeGenHLSL/resources/res-array-rw-counter.hlsl26
-rw-r--r--clang/test/CodeGenHLSL/vk_binding_attr.hlsl2
-rw-r--r--clang/test/SemaTemplate/concepts.cpp42
-rw-r--r--clang/unittests/ASTMatchers/CMakeLists.txt1
-rw-r--r--clang/unittests/ASTMatchers/GtestMatchersTest.cpp418
-rw-r--r--flang/include/flang/Lower/OpenACC.h5
-rw-r--r--flang/include/flang/Parser/parse-tree.h28
-rw-r--r--flang/include/flang/Semantics/symbol.h3
-rw-r--r--flang/lib/Lower/Bridge.cpp68
-rw-r--r--flang/lib/Lower/OpenACC.cpp62
-rw-r--r--flang/lib/Lower/OpenMP/OpenMP.cpp1
-rw-r--r--flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp225
-rw-r--r--flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f9041
-rw-r--r--flang/test/Lower/OpenMP/distribute-parallel-do-simd.f902
-rw-r--r--flang/test/Lower/OpenMP/map-descriptor-deferral.f9096
-rw-r--r--flang/test/Lower/generic-shadows-specific.F9040
-rw-r--r--flang/test/Transforms/omp-map-info-finalization.fir6
-rw-r--r--libc/docs/build_and_test.rst16
-rw-r--r--libc/src/__support/OSUtil/linux/auxv.h3
-rw-r--r--libc/src/__support/OSUtil/linux/syscall.h14
-rw-r--r--libc/src/__support/threads/linux/thread.cpp2
-rw-r--r--libc/src/sys/mman/linux/mmap.cpp2
-rw-r--r--libc/startup/linux/aarch64/tls.cpp2
-rw-r--r--libc/startup/linux/riscv/tls.cpp2
-rw-r--r--libc/startup/linux/x86_64/tls.cpp2
-rw-r--r--libcxx/docs/TestingLibcxx.rst6
-rw-r--r--libcxx/include/string2
-rw-r--r--libcxx/test/benchmarks/spec.gen.py32
-rw-r--r--libcxx/test/configs/apple-libc++-shared.cfg.in2
-rw-r--r--libcxx/test/configs/apple-libc++-system.cfg.in2
-rw-r--r--libcxx/test/configs/armv7m-picolibc-libc++.cfg.in2
-rw-r--r--libcxx/test/configs/ibm-libc++-shared.cfg.in2
-rw-r--r--libcxx/test/configs/llvm-libc++-android.cfg.in2
-rw-r--r--libcxx/test/configs/llvm-libc++-mingw.cfg.in2
-rw-r--r--libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in2
-rw-r--r--libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in2
-rw-r--r--libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in2
-rw-r--r--libcxx/test/configs/llvm-libc++-shared.cfg.in2
-rw-r--r--libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in2
-rw-r--r--libcxx/test/configs/llvm-libc++-static.cfg.in2
-rw-r--r--libcxx/test/configs/stdlib-libstdc++.cfg.in2
-rw-r--r--libcxx/test/configs/stdlib-native.cfg.in2
-rw-r--r--libcxx/test/selftest/dsl/dsl.sh.py2
-rw-r--r--libcxx/test/selftest/dsl/lit.local.cfg4
-rw-r--r--libcxx/test/selftest/file_dependencies/absolute-and-relative-paths.sh.cpp4
-rw-r--r--libcxx/test/selftest/file_dependencies/substitute-in-dependencies.sh.cpp2
-rw-r--r--libcxx/test/selftest/tmpdir-exists.sh.cpp4
-rw-r--r--libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp25
-rw-r--r--libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.verify.cpp40
-rw-r--r--libcxx/utils/libcxx/test/dsl.py4
-rw-r--r--libcxx/utils/libcxx/test/format.py26
-rwxr-xr-xlibcxx/utils/ssh.py2
-rwxr-xr-xlibcxx/utils/test-at-commit2
-rw-r--r--libcxxabi/test/configs/apple-libc++abi-shared.cfg.in2
-rw-r--r--libcxxabi/test/configs/apple-libc++abi-system.cfg.in2
-rw-r--r--libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in2
-rw-r--r--libcxxabi/test/configs/ibm-libc++abi-shared.cfg.in2
-rw-r--r--libcxxabi/test/configs/llvm-libc++abi-android.cfg.in2
-rw-r--r--libcxxabi/test/configs/llvm-libc++abi-merged.cfg.in2
-rw-r--r--libcxxabi/test/configs/llvm-libc++abi-mingw.cfg.in2
-rw-r--r--libcxxabi/test/configs/llvm-libc++abi-shared-clangcl.cfg.in2
-rw-r--r--libcxxabi/test/configs/llvm-libc++abi-shared.cfg.in2
-rw-r--r--libcxxabi/test/configs/llvm-libc++abi-static-clangcl.cfg.in2
-rw-r--r--libcxxabi/test/configs/llvm-libc++abi-static.cfg.in2
-rw-r--r--libunwind/test/configs/apple-libunwind-system.cfg.in2
-rw-r--r--libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in2
-rw-r--r--libunwind/test/configs/ibm-libunwind-shared.cfg.in2
-rw-r--r--libunwind/test/configs/llvm-libunwind-merged.cfg.in2
-rw-r--r--libunwind/test/configs/llvm-libunwind-shared-mingw.cfg.in2
-rw-r--r--libunwind/test/configs/llvm-libunwind-shared.cfg.in2
-rw-r--r--libunwind/test/configs/llvm-libunwind-static-mingw.cfg.in2
-rw-r--r--libunwind/test/configs/llvm-libunwind-static.cfg.in2
-rw-r--r--lld/utils/speed-test-reproducers/collect.nix152
-rwxr-xr-xlld/utils/speed-test-reproducers/ld-wrapper.sh50
-rw-r--r--lldb/bindings/python/python-swigsafecast.swig4
-rw-r--r--lldb/bindings/python/python-wrapper.swig24
-rw-r--r--lldb/cmake/modules/AddLLDB.cmake6
-rw-r--r--lldb/docs/use/python-reference.rst2
-rw-r--r--lldb/docs/use/tutorials/creating-custom-breakpoints.md46
-rw-r--r--lldb/include/lldb/API/SBBreakpoint.h8
-rw-r--r--lldb/include/lldb/API/SBBreakpointLocation.h2
-rw-r--r--lldb/include/lldb/API/SBFrame.h1
-rw-r--r--lldb/include/lldb/Breakpoint/Breakpoint.h59
-rw-r--r--lldb/include/lldb/Breakpoint/BreakpointLocation.h106
-rw-r--r--lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h3
-rw-r--r--lldb/include/lldb/Breakpoint/BreakpointLocationList.h3
-rw-r--r--lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h7
-rw-r--r--lldb/include/lldb/Breakpoint/BreakpointSite.h3
-rw-r--r--lldb/include/lldb/Breakpoint/StopPointSiteList.h24
-rw-r--r--lldb/include/lldb/Breakpoint/StoppointSite.h7
-rw-r--r--lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h10
-rw-r--r--lldb/include/lldb/Interpreter/ScriptInterpreter.h6
-rw-r--r--lldb/source/API/SBBreakpoint.cpp9
-rw-r--r--lldb/source/Breakpoint/Breakpoint.cpp94
-rw-r--r--lldb/source/Breakpoint/BreakpointLocation.cpp176
-rw-r--r--lldb/source/Breakpoint/BreakpointLocationCollection.cpp16
-rw-r--r--lldb/source/Breakpoint/BreakpointLocationList.cpp5
-rw-r--r--lldb/source/Breakpoint/BreakpointResolverScripted.cpp23
-rw-r--r--lldb/source/Breakpoint/BreakpointSite.cpp6
-rw-r--r--lldb/source/Core/ModuleList.cpp13
-rw-r--r--lldb/source/Interpreter/ScriptInterpreter.cpp13
-rw-r--r--lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp243
-rw-r--r--lldb/source/Plugins/Platform/Android/PlatformAndroid.h11
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp26
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h6
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp49
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h27
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h3
-rw-r--r--lldb/source/Symbol/Function.cpp2
-rw-r--r--lldb/source/Target/StopInfo.cpp71
-rw-r--r--lldb/test/API/functionalities/breakpoint/scripted_bkpt/resolver.py1
-rw-r--r--lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/Makefile4
-rw-r--r--lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/TestWasHit.py102
-rw-r--r--lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/bkpt_resolver.py49
-rw-r--r--lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/main.c14
-rw-r--r--lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py17
-rw-r--r--lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml29
-rw-r--r--lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json6
-rw-r--r--lldb/tools/driver/CMakeLists.txt4
-rw-r--r--lldb/tools/driver/Driver.cpp52
-rw-r--r--lldb/unittests/Host/JSONTransportTest.cpp167
-rw-r--r--lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp10
-rw-r--r--llvm/benchmarks/Mustache.cpp51
-rw-r--r--llvm/docs/MLGO.rst2
-rw-r--r--llvm/include/llvm/Analysis/IR2Vec.h61
-rw-r--r--llvm/include/llvm/IR/DiagnosticInfo.h2
-rw-r--r--llvm/include/llvm/Support/Mustache.h15
-rw-r--r--llvm/include/llvm/Target/TargetSelectionDAG.td8
-rw-r--r--llvm/lib/Analysis/IR2Vec.cpp180
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp51
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp32
-rw-r--r--llvm/lib/IR/DiagnosticInfo.cpp7
-rw-r--r--llvm/lib/Support/Mustache.cpp180
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp12
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZb.td90
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrPredicates.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFive7.td142
-rw-r--r--llvm/lib/Target/RISCV/RISCVScheduleV.td16
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX10.td4
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp2
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp34
-rw-r--r--llvm/lib/Transforms/IPO/PartialInlining.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp6
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp2
-rw-r--r--llvm/test/Analysis/IR2Vec/unreachable.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll49
-rw-r--r--llvm/test/CodeGen/NVPTX/i32x2-instructions.ll1625
-rw-r--r--llvm/test/CodeGen/RISCV/features-info.ll1
-rw-r--r--llvm/test/CodeGen/X86/fmaxnum.ll46
-rw-r--r--llvm/test/CodeGen/X86/fminimum-fmaximum.ll99
-rw-r--r--llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll99
-rw-r--r--llvm/test/CodeGen/X86/fminnum.ll46
-rw-r--r--llvm/test/CodeGen/X86/pgo-profile-o0.ll49
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/trunc_select.ll12
-rw-r--r--llvm/test/Transforms/AggressiveInstCombine/trunc_select_cmp.ll11
-rw-r--r--llvm/test/Transforms/Coroutines/coro-elide-safe.ll (renamed from llvm/test/Transforms/Coroutines/coro-transform-must-elide.ll)28
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll244
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll9
-rw-r--r--llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll23
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s152
-rw-r--r--llvm/tools/llc/llc.cpp40
-rw-r--r--llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp16
-rw-r--r--llvm/unittests/Analysis/IR2VecTest.cpp92
-rw-r--r--llvm/unittests/Support/MustacheTest.cpp725
-rw-r--r--llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/ASTMatchers/BUILD.gn1
-rw-r--r--llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp5
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td57
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp34
-rw-r--r--mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp147
-rw-r--r--mlir/test/Dialect/LLVMIR/rocdl.mlir42
-rw-r--r--mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir26
-rw-r--r--mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir27
-rw-r--r--mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir64
-rw-r--r--mlir/test/Target/LLVMIR/rocdl.mlir42
-rw-r--r--mlir/test/lib/Dialect/Test/TestOps.td13
-rw-r--r--mlir/test/lib/Dialect/Test/TestPatterns.cpp10
-rw-r--r--mlir/test/mlir-tblgen/pattern.mlir28
-rw-r--r--mlir/tools/mlir-tblgen/RewriterGen.cpp49
-rw-r--r--offload/test/offloading/fortran/descriptor-stack-jam-regression.f90101
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel1
241 files changed, 8269 insertions, 2295 deletions
diff --git a/.ci/generate_test_report_github.py b/.ci/generate_test_report_github.py
index 7242264..6785e82 100644
--- a/.ci/generate_test_report_github.py
+++ b/.ci/generate_test_report_github.py
@@ -8,10 +8,15 @@ import platform
import generate_test_report_lib
-PLATFORM_TITLES = {
- "Windows": ":window: Windows x64 Test Results",
- "Linux": ":penguin: Linux x64 Test Results",
-}
+def compute_platform_title() -> str:
+ logo = ":window:" if platform.system() == "Windows" else ":penguin:"
+ # On Linux the machine value is x86_64 on Windows it is AMD64.
+ if platform.machine() == "x86_64" or platform.machine() == "AMD64":
+ arch = "x64"
+ else:
+ arch = platform.machine()
+ return f"{logo} {platform.system()} {arch} Test Results"
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -22,7 +27,7 @@ if __name__ == "__main__":
args = parser.parse_args()
report = generate_test_report_lib.generate_report_from_files(
- PLATFORM_TITLES[platform.system()], args.return_code, args.build_test_logs
+ compute_platform_title(), args.return_code, args.build_test_logs
)
print(report)
diff --git a/.ci/utils.sh b/.ci/utils.sh
index 87afbbd..5d32968 100644
--- a/.ci/utils.sh
+++ b/.ci/utils.sh
@@ -56,6 +56,7 @@ function start-group {
export PIP_BREAK_SYSTEM_PACKAGES=1
pip install -q -r "${MONOREPO_ROOT}"/.ci/all_requirements.txt
-if [[ "$GITHUB_ACTIONS" != "" ]]; then
+# The ARM64 builders run on AWS and don't have access to the GCS cache.
+if [[ "$GITHUB_ACTIONS" != "" ]] && [[ "$RUNNER_ARCH" != "ARM64" ]]; then
python .ci/cache_lit_timing_files.py download
fi
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index a9c107e..03c0c01 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -24,17 +24,45 @@ concurrency:
jobs:
premerge-checks-linux:
- name: Build and Test Linux
+ name: Build and Test Linux${{ (startsWith(matrix.runs-on, 'depot-ubuntu-24.04-arm') && ' AArch64') || '' }}
if: >-
github.repository_owner == 'llvm' &&
(github.event_name != 'pull_request' || github.event.action != 'closed')
- runs-on: llvm-premerge-linux-runners
+ strategy:
+ fail-fast: false
+ matrix:
+ runs-on:
+ - depot-ubuntu-24.04-arm-16
+ - llvm-premerge-linux-runners
+ runs-on: ${{ matrix.runs-on }}
+ container:
+ # The llvm-premerge agents are already containers and running the
+ # this same image, so we can't use a container for the github action
+ # job. The depot containers are running on VMs, so we can use a
+ # container. This helps ensure the build environment is as close
+ # as possible on both the depot runners and the llvm-premerge runners.
+ image: ${{ (startsWith(matrix.runs-on, 'depot-ubuntu-24.04-arm') && format('ghcr.io/{0}/arm64v8/ci-ubuntu-24.04',github.repository_owner) ) || null }}
+ # --privileged is needed to run the lldb tests that disable aslr.
+ # The SCCACHE environment variables are need to be copied from the host
+ # to the container to make sure it is configured correctly to use the
+ # depot cache.
+ options: >-
+ --privileged
+ --env SCCACHE_WEBDAV_ENDPOINT
+ --env SCCACHE_WEBDAV_TOKEN
+ defaults:
+ run:
+ # The run step defaults to using sh as the shell when running in a
+ # container, so make bash the default to ensure consistency between
+ # container and non-container jobs.
+ shell: bash
steps:
- name: Checkout LLVM
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
fetch-depth: 2
- name: Build and Test
+ continue-on-error: ${{ runner.arch == 'ARM64' }}
run: |
git config --global --add safe.directory '*'
@@ -54,11 +82,16 @@ jobs:
export CC=/opt/llvm/bin/clang
export CXX=/opt/llvm/bin/clang++
- # This environment variable is passes into the container through the
- # runner pod definition. This differs between our two clusters which
- # why we do not hardcode it.
- export SCCACHE_GCS_BUCKET=$CACHE_GCS_BUCKET
- export SCCACHE_GCS_RW_MODE=READ_WRITE
+ # The linux-premerge runners are hosted on GCP and have a different
+ # cache setup than the depot runners.
+ if [[ "${{ matrix.runs-on }}" = "llvm-premerge-linux-runners" ]]; then
+ # This environment variable is passes into the container through the
+ # runner pod definition. This differs between our two clusters which
+ # why we do not hardcode it.
+ export SCCACHE_GCS_BUCKET=$CACHE_GCS_BUCKET
+ export SCCACHE_GCS_RW_MODE=READ_WRITE
+ fi
+ env
# Set the idle timeout to zero to ensure sccache runs for the
# entire duration of the job. Otherwise it might stop if we run
@@ -78,7 +111,7 @@ jobs:
if: '!cancelled()'
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
- name: Premerge Artifacts (Linux)
+ name: Premerge Artifacts (Linux ${{ runner.arch }})
path: artifacts/
retention-days: 5
include-hidden-files: 'true'
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index c428828..a0e7995 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -1514,6 +1514,12 @@ void RewriteInstance::registerFragments() {
}
if (BD) {
BinaryFunction *BF = BC->getFunctionForSymbol(BD->getSymbol());
+ if (BF == &Function) {
+ BC->errs()
+ << "BOLT-WARNING: fragment maps to the same function as parent: "
+ << Function << '\n';
+ continue;
+ }
if (BF) {
BC->registerFragment(Function, *BF);
continue;
@@ -4005,10 +4011,12 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) {
BC->outs() << '\n';
AllocationDone = true;
} else {
- BC->errs() << "BOLT-WARNING: original .text too small to fit the new code"
- << " using 0x" << Twine::utohexstr(opts::AlignText)
- << " alignment. " << CodeSize << " bytes needed, have "
- << BC->OldTextSectionSize << " bytes available.\n";
+ BC->errs() << "BOLT-WARNING: --use-old-text failed. The original .text "
+ "too small to fit the new code using 0x"
+ << Twine::utohexstr(opts::AlignText) << " alignment. "
+ << CodeSize << " bytes needed, have " << BC->OldTextSectionSize
+ << " bytes available. Rebuilding without --use-old-text may "
+ "produce a smaller binary\n";
opts::UseOldText = false;
}
}
diff --git a/bolt/test/X86/fragment-alias.s b/bolt/test/X86/fragment-alias.s
new file mode 100644
index 0000000..3392dd5
--- /dev/null
+++ b/bolt/test/X86/fragment-alias.s
@@ -0,0 +1,13 @@
+## This test reproduces the issue where a fragment has the same address as
+## parent function.
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t
+# RUN: llvm-bolt %t -o %t.out 2>&1 | FileCheck %s
+# CHECK: BOLT-WARNING: fragment maps to the same function as parent: main/1(*2)
+.type main, @function
+.type main.cold, @function
+main.cold:
+main:
+ ret
+.size main, .-main
+.size main.cold, .-main.cold
diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
index b37dc27..b4b9322 100644
--- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
+++ b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
@@ -46,7 +46,13 @@ public:
const ClangDocContext &CDCtx) override;
};
-class MustacheTemplateFile : public Template {
+class MustacheTemplateFile {
+ BumpPtrAllocator Allocator;
+ StringSaver Saver;
+ MustacheContext Ctx;
+ Template T;
+ std::unique_ptr<MemoryBuffer> Buffer;
+
public:
static Expected<std::unique_ptr<MustacheTemplateFile>>
createMustacheFile(StringRef FileName) {
@@ -54,10 +60,8 @@ public:
MemoryBuffer::getFile(FileName);
if (auto EC = BufferOrError.getError())
return createFileOpenError(FileName, EC);
-
- std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrError.get());
- StringRef FileContent = Buffer->getBuffer();
- return std::make_unique<MustacheTemplateFile>(FileContent);
+ return std::make_unique<MustacheTemplateFile>(
+ std::move(BufferOrError.get()));
}
Error registerPartialFile(StringRef Name, StringRef FileName) {
@@ -68,11 +72,15 @@ public:
std::unique_ptr<MemoryBuffer> Buffer = std::move(BufferOrError.get());
StringRef FileContent = Buffer->getBuffer();
- registerPartial(Name.str(), FileContent.str());
+ T.registerPartial(Name.str(), FileContent.str());
return Error::success();
}
- MustacheTemplateFile(StringRef TemplateStr) : Template(TemplateStr) {}
+ void render(json::Value &V, raw_ostream &OS) { T.render(V, OS); }
+
+ MustacheTemplateFile(std::unique_ptr<MemoryBuffer> &&B)
+ : Saver(Allocator), Ctx(Allocator, Saver), T(B->getBuffer(), Ctx),
+ Buffer(std::move(B)) {}
};
static std::unique_ptr<MustacheTemplateFile> NamespaceTemplate = nullptr;
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 4139184..a324d18 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -372,7 +372,7 @@ Clang-Tidy Checks
:doc:`readability-avoid-nested-conditional-operator <readability/avoid-nested-conditional-operator>`,
:doc:`readability-avoid-return-with-void-value <readability/avoid-return-with-void-value>`, "Yes"
:doc:`readability-avoid-unconditional-preprocessor-if <readability/avoid-unconditional-preprocessor-if>`,
- :doc:`readability-braces-around-statements <readability/braces-around-statements>`,
+ :doc:`readability-braces-around-statements <readability/braces-around-statements>`, "Yes"
:doc:`readability-const-return-type <readability/const-return-type>`, "Yes"
:doc:`readability-container-contains <readability/container-contains>`, "Yes"
:doc:`readability-container-data-pointer <readability/container-data-pointer>`, "Yes"
@@ -571,12 +571,12 @@ Check aliases
:doc:`cppcoreguidelines-non-private-member-variables-in-classes <cppcoreguidelines/non-private-member-variables-in-classes>`, :doc:`misc-non-private-member-variables-in-classes <misc/non-private-member-variables-in-classes>`,
:doc:`cppcoreguidelines-use-default-member-init <cppcoreguidelines/use-default-member-init>`, :doc:`modernize-use-default-member-init <modernize/use-default-member-init>`, "Yes"
:doc:`fuchsia-header-anon-namespaces <fuchsia/header-anon-namespaces>`, :doc:`google-build-namespaces <google/build-namespaces>`,
- :doc:`google-readability-braces-around-statements <google/readability-braces-around-statements>`, :doc:`readability-braces-around-statements <readability/braces-around-statements>`,
+ :doc:`google-readability-braces-around-statements <google/readability-braces-around-statements>`, :doc:`readability-braces-around-statements <readability/braces-around-statements>`, "Yes"
:doc:`google-readability-function-size <google/readability-function-size>`, :doc:`readability-function-size <readability/function-size>`,
:doc:`google-readability-namespace-comments <google/readability-namespace-comments>`, :doc:`llvm-namespace-comment <llvm/namespace-comment>`,
:doc:`hicpp-avoid-c-arrays <hicpp/avoid-c-arrays>`, :doc:`modernize-avoid-c-arrays <modernize/avoid-c-arrays>`,
:doc:`hicpp-avoid-goto <hicpp/avoid-goto>`, :doc:`cppcoreguidelines-avoid-goto <cppcoreguidelines/avoid-goto>`,
- :doc:`hicpp-braces-around-statements <hicpp/braces-around-statements>`, :doc:`readability-braces-around-statements <readability/braces-around-statements>`,
+ :doc:`hicpp-braces-around-statements <hicpp/braces-around-statements>`, :doc:`readability-braces-around-statements <readability/braces-around-statements>`, "Yes"
:doc:`hicpp-deprecated-headers <hicpp/deprecated-headers>`, :doc:`modernize-deprecated-headers <modernize/deprecated-headers>`, "Yes"
:doc:`hicpp-explicit-conversions <hicpp/explicit-conversions>`, :doc:`google-explicit-constructor <google/explicit-constructor>`, "Yes"
:doc:`hicpp-function-size <hicpp/function-size>`, :doc:`readability-function-size <readability/function-size>`,
diff --git a/clang/docs/DebuggingCoroutines.rst b/clang/docs/DebuggingCoroutines.rst
index 9eaf8d4..c62e2ea 100644
--- a/clang/docs/DebuggingCoroutines.rst
+++ b/clang/docs/DebuggingCoroutines.rst
@@ -179,8 +179,8 @@ generator and its internal state.
To do so, we can simply look into the ``gen.hdl`` variable. LLDB comes with a
pretty printer for ``std::coroutine_handle`` which will show us the internal
-state of the coroutine. For GDB, you will have to use the ``show-coro-frame``
-command provided by the :ref:`gdb-script`.
+state of the coroutine. For GDB, the pretty printer is provided by a script,
+see :ref:`gdb-script` for setup instructions.
.. image:: ./coro-generator-suspended.png
@@ -206,23 +206,16 @@ Tracking the exact suspension point
Among the compiler-generated members, the ``__coro_index`` is particularly
important. This member identifies the suspension point at which the coroutine
-is currently suspended.
+is currently suspended. However, it is non-trivial to map this number back to
+a source code location.
-However, it is non-trivial to map this number back to a source code location.
-The compiler emits debug info labels for the suspension points. This allows us
-to map the suspension point index back to a source code location. In gdb, we
-can use the ``info line`` command to get the source code location of the
-suspension point.
+For GDB, the provided :ref:`gdb-script` already takes care of this and provides
+the exact line number of the suspension point as part of the coroutine handle's
+summary string. Unfortunately, LLDB's pretty-printer does not support this, yet.
+Furthermore, those labels are only emitted starting with clang 21.0.
-::
-
- (gdb) info line -function coro_task -label __coro_resume_2
- Line 45 of "llvm-example.cpp" starts at address 0x1b1b <_ZL9coro_taski.resume+555> and ends at 0x1b46 <_ZL9coro_taski.resume+598>.
- Line 45 of "llvm-example.cpp" starts at address 0x201b <_ZL9coro_taski.destroy+555> and ends at 0x2046 <_ZL9coro_taski.destroy+598>.
- Line 45 of "llvm-example.cpp" starts at address 0x253b <_ZL9coro_taski.cleanup+555> and ends at 0x2566 <_ZL9coro_taski.cleanup+598>.
-
-LLDB does not support looking up labels. Furthermore, those labels are only emitted
-starting with clang 21.0.
+When debugging with LLDB or when using older clang versions, we will have to use
+a different approach.
For simple cases, you might still be able to guess the suspension point correctly.
Alternatively, you might also want to modify your coroutine library to store
@@ -655,33 +648,17 @@ There are two possible approaches to do so:
We can lookup their types and thereby get the types of promise
and coroutine frame.
-In gdb, one can use the following approach to devirtualize a coroutine type,
-assuming we have a ``std::coroutine_handle`` is at address 0x418eb0:
-
-::
+In general, the second approach is preferred, as it is more portable.
- (gdb) # Get the address of coroutine frame
- (gdb) print/x *0x418eb0
- $1 = 0x4019e0
- (gdb) # Get the linkage name for the coroutine
- (gdb) x 0x4019e0
- 0x4019e0 <_ZL9coro_taski>: 0xe5894855
- (gdb) # Turn off the demangler temporarily to avoid the debugger misunderstanding the name.
- (gdb) set demangle-style none
- (gdb) # The coroutine frame type is 'linkage_name.coro_frame_ty'
- (gdb) print ('_ZL9coro_taski.coro_frame_ty')*(0x418eb0)
- $2 = {__resume_fn = 0x4019e0 <coro_task(int)>, __destroy_fn = 0x402000 <coro_task(int)>, __promise = {...}, ...}
-
-In practice, one would use the ``show-coro-frame`` command provided by the
-:ref:`gdb-script`.
+To do so, we look up the types in the destroy function and not the resume function
+because the resume function pointer will be set to a ``nullptr`` as soon as a
+coroutine reaches its final suspension point. If we used the resume function,
+devirtualization would hence fail for all coroutines that have reached their final
+suspension point.
LLDB comes with devirtualization support out of the box, as part of the
-pretty-printer for ``std::coroutine_handle``. Internally, this pretty-printer
-uses the second approach. We look up the types in the destroy function and not
-the resume function because the resume function pointer will be set to a
-``nullptr`` as soon as a coroutine reaches its final suspension point. If we used
-the resume function, devirtualization would hence fail for all coroutines that
-have reached their final suspension point.
+pretty-printer for ``std::coroutine_handle``. For GDB, a similar pretty-printer
+is provided by the :ref:`gdb-script`.
Interpreting the coroutine frame in optimized builds
----------------------------------------------------
@@ -756,6 +733,26 @@ should not be thought of as directly representing the variables in the C++
source.
+Mapping suspension point indices to source code locations
+---------------------------------------------------------
+
+To aid in mapping a ``__coro_index`` back to a source code location, clang 21.0
+and newer emit special, compiler-generated labels for the suspension points.
+
+In gdb, we can use the ``info line`` command to get the source code location of
+the suspension point.
+
+::
+
+ (gdb) info line -function coro_task -label __coro_resume_2
+ Line 45 of "llvm-example.cpp" starts at address 0x1b1b <_ZL9coro_taski.resume+555> and ends at 0x1b46 <_ZL9coro_taski.resume+598>.
+ Line 45 of "llvm-example.cpp" starts at address 0x201b <_ZL9coro_taski.destroy+555> and ends at 0x2046 <_ZL9coro_taski.destroy+598>.
+ Line 45 of "llvm-example.cpp" starts at address 0x253b <_ZL9coro_taski.cleanup+555> and ends at 0x2566 <_ZL9coro_taski.cleanup+598>.
+
+LLDB does not support looking up labels, yet. For this reason, LLDB's pretty-printer
+does not show the exact line number of the suspension point.
+
+
Resources
=========
@@ -1017,156 +1014,270 @@ Note that this script requires LLDB 21.0 or newer.
GDB Debugger Script
-------------------
-For GDB, the following script provides a couple of useful commands:
+The following script provides:
-* ``async-bt`` to print the stack trace of a coroutine
-* ``show-coro-frame`` to print the coroutine frame, similar to
- LLDB's builtin pretty-printer for coroutine frames
+* a pretty-printer for coroutine handles
+* a frame filter to add coroutine frames to the built-in ``bt`` command
+* the ``get_coro_frame`` and ``get_coro_promise`` functions to be used in
+ expressions, e.g. ``p get_coro_promise(fib.coro_hdl)->current_state``
+
+It can be loaded into GDB using ``source gdb_coro_debugging.py``.
+To load this by default, add this command to your ``~/.gdbinit`` file.
.. code-block:: python
- # debugging-helper.py
+ # gdb_coro_debugging.py
import gdb
from gdb.FrameDecorator import FrameDecorator
- class SymValueWrapper():
- def __init__(self, symbol, value):
- self.sym = symbol
- self.val = value
+ import typing
+ import re
- def __str__(self):
- return str(self.sym) + " = " + str(self.val)
+ def _load_pointer_at(addr: int):
+ return gdb.Value(addr).reinterpret_cast(gdb.lookup_type('void').pointer().pointer()).dereference()
- def get_long_pointer_size():
- return gdb.lookup_type('long').pointer().sizeof
+ """
+ Devirtualized coroutine frame.
- def cast_addr2long_pointer(addr):
- return gdb.Value(addr).cast(gdb.lookup_type('long').pointer())
+ Devirtualizes the promise and frame pointer types by inspecting
+ the destroy function.
- def dereference(addr):
- return long(cast_addr2long_pointer(addr).dereference())
+ Implements `to_string` and `children` to be used by `gdb.printing.PrettyPrinter`.
+ Base class for `CoroutineHandlePrinter`.
+ """
+ class DevirtualizedCoroFrame:
+ def __init__(self, frame_ptr_raw: int, val: gdb.Value | None = None):
+ self.val = val
+ self.frame_ptr_raw = frame_ptr_raw
- class CoroutineFrame(object):
- def __init__(self, task_addr):
- self.frame_addr = task_addr
- self.resume_addr = task_addr
- self.destroy_addr = task_addr + get_long_pointer_size()
- self.promise_addr = task_addr + get_long_pointer_size() * 2
- # In the example, the continuation is the first field member of the promise_type.
- # So they have the same addresses.
- # If we want to generalize the scripts to other coroutine types, we need to be sure
- # the continuation field is the first member of promise_type.
- self.continuation_addr = self.promise_addr
+ # Get the resume and destroy pointers.
+ if frame_ptr_raw == 0:
+ self.resume_ptr = None
+ self.destroy_ptr = None
+ self.promise_ptr = None
+ self.frame_ptr = gdb.Value(frame_ptr_raw).reinterpret_cast(gdb.lookup_type("void").pointer())
+ return
- def next_task_addr(self):
- return dereference(self.continuation_addr)
+ # Get the resume and destroy pointers.
+ self.resume_ptr = _load_pointer_at(frame_ptr_raw)
+ self.destroy_ptr = _load_pointer_at(frame_ptr_raw + 8)
+
+ # Devirtualize the promise and frame pointer types.
+ frame_type = gdb.lookup_type("void")
+ promise_type = gdb.lookup_type("void")
+ self.destroy_func = gdb.block_for_pc(int(self.destroy_ptr))
+ if self.destroy_func is not None:
+ frame_var = gdb.lookup_symbol("__coro_frame", self.destroy_func, gdb.SYMBOL_VAR_DOMAIN)[0]
+ if frame_var is not None:
+ frame_type = frame_var.type
+ promise_var = gdb.lookup_symbol("__promise", self.destroy_func, gdb.SYMBOL_VAR_DOMAIN)[0]
+ if promise_var is not None:
+ promise_type = promise_var.type.strip_typedefs()
+
+ # If the type has a template argument, prefer it over the devirtualized type.
+ if self.val is not None:
+ promise_type_template_arg = self.val.type.template_argument(0)
+ if promise_type_template_arg is not None and promise_type_template_arg.code != gdb.TYPE_CODE_VOID:
+ promise_type = promise_type_template_arg
+
+ self.promise_ptr = gdb.Value(frame_ptr_raw + 16).reinterpret_cast(promise_type.pointer())
+ self.frame_ptr = gdb.Value(frame_ptr_raw).reinterpret_cast(frame_type.pointer())
+
+ # Try to get the suspension point index and look up the exact line entry.
+ self.suspension_point_index = int(self.frame_ptr.dereference()["__coro_index"]) if frame_type.code == gdb.TYPE_CODE_STRUCT else None
+ self.resume_func = gdb.block_for_pc(int(self.resume_ptr))
+ self.resume_label = None
+ if self.resume_func is not None and self.suspension_point_index is not None:
+ label_name = f"__coro_resume_{self.suspension_point_index}"
+ self.resume_label = gdb.lookup_symbol(label_name, self.resume_func, gdb.SYMBOL_LABEL_DOMAIN)[0]
+
+ def get_function_name(self):
+ if self.destroy_func is None:
+ return None
+ name = self.destroy_func.function.name
+ # Strip the "clone" suffix if it exists.
+ if "() [clone " in name:
+ name = name[:name.index("() [clone ")]
+ return name
+
+ def to_string(self):
+ result = "coro(" + str(self.frame_ptr_raw) + ")"
+ if self.destroy_func is not None:
+ result += ": " + self.get_function_name()
+ if self.resume_label is not None:
+ result += ", line " + str(self.resume_label.line)
+ if self.suspension_point_index is not None:
+ result += ", suspension point " + str(self.suspension_point_index)
+ return result
+
+ def children(self):
+ if self.resume_ptr is None:
+ return [
+ ("coro_frame", self.frame_ptr),
+ ]
+ else:
+ return [
+ ("resume", self.resume_ptr),
+ ("destroy", self.destroy_ptr),
+ ("promise", self.promise_ptr),
+ ("coro_frame", self.frame_ptr)
+ ]
- class CoroutineFrameDecorator(FrameDecorator):
- def __init__(self, coro_frame):
- super(CoroutineFrameDecorator, self).__init__(None)
- self.coro_frame = coro_frame
- self.resume_func = dereference(self.coro_frame.resume_addr)
- self.resume_func_block = gdb.block_for_pc(self.resume_func)
- if self.resume_func_block is None:
- raise Exception('Not stackless coroutine.')
- self.line_info = gdb.find_pc_line(self.resume_func)
- def address(self):
- return self.resume_func
+ # Works for both libc++ and libstdc++.
+ libcxx_corohdl_regex = re.compile('^std::__[A-Za-z0-9]+::coroutine_handle<.+>$|^std::coroutine_handle<.+>(( )?&)?$')
- def filename(self):
- return self.line_info.symtab.filename
+ def _extract_coro_frame_ptr_from_handle(val: gdb.Value):
+ if libcxx_corohdl_regex.match(val.type.strip_typedefs().name) is None:
+ raise ValueError("Expected a std::coroutine_handle, got %s" % val.type.strip_typedefs().name)
- def frame_args(self):
- return [SymValueWrapper("frame_addr", cast_addr2long_pointer(self.coro_frame.frame_addr)),
- SymValueWrapper("promise_addr", cast_addr2long_pointer(self.coro_frame.promise_addr)),
- SymValueWrapper("continuation_addr", cast_addr2long_pointer(self.coro_frame.continuation_addr))
- ]
+ # We expect the coroutine handle to have a single field, which is the frame pointer.
+ # This heuristic works for both libc++ and libstdc++.
+ fields = val.type.fields()
+ if len(fields) != 1:
+ raise ValueError("Expected 1 field, got %d" % len(fields))
+ return int(val[fields[0]])
- def function(self):
- return self.resume_func_block.function.print_name
- def line(self):
- return self.line_info.line
-
- class StripDecorator(FrameDecorator):
- def __init__(self, frame):
- super(StripDecorator, self).__init__(frame)
- self.frame = frame
- f = frame.function()
- self.function_name = f
-
- def __str__(self, shift = 2):
- addr = "" if self.address() is None else '%#x' % self.address() + " in "
- location = "" if self.filename() is None else " at " + self.filename() + ":" + str(self.line())
- return addr + self.function() + " " + str([str(args) for args in self.frame_args()]) + location
-
- class CoroutineFilter:
- def create_coroutine_frames(self, task_addr):
- frames = []
- while task_addr != 0:
- coro_frame = CoroutineFrame(task_addr)
- frames.append(CoroutineFrameDecorator(coro_frame))
- task_addr = coro_frame.next_task_addr()
- return frames
-
- class AsyncStack(gdb.Command):
+ """
+ Pretty printer for `std::coroutine_handle<T>`
+
+ Works for both libc++ and libstdc++.
+
+ It prints the coroutine handle as a struct with the following fields:
+ - resume: the resume function pointer
+ - destroy: the destroy function pointer
+ - promise: the promise pointer
+ - coro_frame: the coroutine frame pointer
+
+ Most of the functionality is implemented in `DevirtualizedCoroFrame`.
+ """
+ class CoroutineHandlePrinter(DevirtualizedCoroFrame):
+ def __init__(self, val : gdb.Value):
+ frame_ptr_raw = _extract_coro_frame_ptr_from_handle(val)
+ super(CoroutineHandlePrinter, self).__init__(frame_ptr_raw, val)
+
+
+ def build_pretty_printer():
+ pp = gdb.printing.RegexpCollectionPrettyPrinter("coroutine")
+ pp.add_printer('std::coroutine_handle', libcxx_corohdl_regex, CoroutineHandlePrinter)
+ return pp
+
+ gdb.printing.register_pretty_printer(
+ gdb.current_objfile(),
+ build_pretty_printer())
+
+
+ """
+ Get the coroutine frame pointer from a coroutine handle.
+
+ Usage:
+ ```
+ p *get_coro_frame(coroutine_hdl)
+ ```
+ """
+ class GetCoroFrame(gdb.Function):
def __init__(self):
- super(AsyncStack, self).__init__("async-bt", gdb.COMMAND_USER)
+ super(GetCoroFrame, self).__init__("get_coro_frame")
- def invoke(self, arg, from_tty):
- coroutine_filter = CoroutineFilter()
- argv = gdb.string_to_argv(arg)
- if len(argv) == 0:
- try:
- task = gdb.parse_and_eval('__coro_frame')
- task = int(str(task.address), 16)
- except Exception:
- print ("Can't find __coro_frame in current context.\n" +
- "Please use `async-bt` in stackless coroutine context.")
- return
- elif len(argv) != 1:
- print("usage: async-bt <pointer to task>")
- return
- else:
- task = int(argv[0], 16)
+ def invoke(self, coroutine_hdl_raw):
+ return CoroutineHandlePrinter(coroutine_hdl_raw).frame_ptr
+
+ GetCoroFrame()
- frames = coroutine_filter.create_coroutine_frames(task)
- i = 0
- for f in frames:
- print '#'+ str(i), str(StripDecorator(f))
- i += 1
- return
- AsyncStack()
+ """
+    Get the coroutine promise pointer from a coroutine handle.
- class ShowCoroFrame(gdb.Command):
+ Usage:
+ ```
+ p *get_coro_promise(coroutine_hdl)
+ ```
+ """
+ class GetCoroFrame(gdb.Function):
def __init__(self):
- super(ShowCoroFrame, self).__init__("show-coro-frame", gdb.COMMAND_USER)
+ super(GetCoroFrame, self).__init__("get_coro_promise")
- def invoke(self, arg, from_tty):
- argv = gdb.string_to_argv(arg)
- if len(argv) != 1:
- print("usage: show-coro-frame <address of coroutine frame>")
- return
+ def invoke(self, coroutine_hdl_raw):
+ return CoroutineHandlePrinter(coroutine_hdl_raw).promise_ptr
- addr = int(argv[0], 16)
- block = gdb.block_for_pc(long(cast_addr2long_pointer(addr).dereference()))
- if block is None:
- print "block " + str(addr) + " is None."
- return
+ GetCoroFrame()
+
+
+ """
+ Decorator for coroutine frames.
+
+ Used by `CoroutineFrameFilter` to add the coroutine frames to the built-in `bt` command.
+ """
+ class CoroutineFrameDecorator(FrameDecorator):
+ def __init__(self, coro_frame: DevirtualizedCoroFrame, inferior_frame: gdb.Frame):
+ super(CoroutineFrameDecorator, self).__init__(inferior_frame)
+ self.coro_frame = coro_frame
+
+ def function(self):
+ func_name = self.coro_frame.get_function_name()
+ if func_name is not None:
+ return "[async] " + func_name
+ return "[async] coroutine (coro_frame=" + str(self.coro_frame.frame_ptr_raw) + ")"
+
+ def address(self):
+ return None
+
+ def filename(self):
+ if self.coro_frame.destroy_func is not None:
+ return self.coro_frame.destroy_func.function.symtab.filename
+ return None
+
+ def line(self):
+ if self.coro_frame.resume_label is not None:
+ return self.coro_frame.resume_label.line
+ return None
+
+ def frame_args(self):
+ return []
+
+ def frame_locals(self):
+ return []
+
+
+ def _get_continuation(promise: gdb.Value) -> DevirtualizedCoroFrame | None:
+ try:
+            # TODO: adjust this according to your coroutine framework
+ return DevirtualizedCoroFrame(_extract_coro_frame_ptr_from_handle(promise["continuation"]))
+ except Exception as e:
+ return None
- # Disable demangling since gdb will treat names starting with `_Z`(The marker for Itanium ABI) specially.
- gdb.execute("set demangle-style none")
- coro_frame_type = gdb.lookup_type(block.function.linkage_name + ".coro_frame_ty")
- coro_frame_ptr_type = coro_frame_type.pointer()
- coro_frame = gdb.Value(addr).cast(coro_frame_ptr_type).dereference()
+ def _create_coroutine_frames(coro_frame: DevirtualizedCoroFrame, inferior_frame: gdb.Frame):
+ while coro_frame is not None:
+ yield CoroutineFrameDecorator(coro_frame, inferior_frame)
+ coro_frame = _get_continuation(coro_frame.promise_ptr)
- gdb.execute("set demangle-style auto")
- gdb.write(coro_frame.format_string(pretty_structs = True))
- ShowCoroFrame()
+ """
+ Frame filter to add coroutine frames to the built-in `bt` command.
+ """
+ class CppCoroutineFrameFilter():
+ def __init__(self):
+ self.name = "CppCoroutineFrameFilter"
+ self.priority = 50
+ self.enabled = True
+ # Register this frame filter with the global frame_filters dictionary.
+ gdb.frame_filters[self.name] = self
+
+ def filter(self, frame_iter: typing.Iterable[gdb.FrameDecorator]):
+ for frame in frame_iter:
+ yield frame
+ inferior_frame = frame.inferior_frame()
+ try:
+ promise_ptr = inferior_frame.read_var("__promise")
+ except Exception:
+ continue
+ parent_coro = _get_continuation(promise_ptr)
+ if parent_coro is not None:
+ yield from _create_coroutine_frames(parent_coro, inferior_frame)
+
+ CppCoroutineFrameFilter()
Further Reading
---------------
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ca7e133..99aa545 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -460,6 +460,7 @@ Bug Fixes to C++ Support
- Fix a crash when attempting to deduce a deduction guide from a non deducible template template parameter. (#130604)
- Fix for clang incorrectly rejecting the default construction of a union with
nontrivial member when another member has an initializer. (#GH81774)
+- Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753)
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/HLSLResource.h b/clang/include/clang/AST/HLSLResource.h
index 7440050..1be1e42 100644
--- a/clang/include/clang/AST/HLSLResource.h
+++ b/clang/include/clang/AST/HLSLResource.h
@@ -74,6 +74,19 @@ struct ResourceBindingAttrs {
assert(hasBinding() && !isExplicit() && !hasImplicitOrderID());
RegBinding->setImplicitBindingOrderID(Value);
}
+ void setCounterImplicitOrderID(unsigned Value) const {
+ assert(hasBinding() && !hasCounterImplicitOrderID());
+ RegBinding->setImplicitCounterBindingOrderID(Value);
+ }
+
+ bool hasCounterImplicitOrderID() const {
+ return RegBinding && RegBinding->hasImplicitCounterBindingOrderID();
+ }
+
+ unsigned getCounterImplicitOrderID() const {
+ assert(hasCounterImplicitOrderID());
+ return RegBinding->getImplicitCounterBindingOrderID();
+ }
};
} // namespace hlsl
diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h
index 58ba8d91..79cffeb 100644
--- a/clang/include/clang/AST/OpenACCClause.h
+++ b/clang/include/clang/AST/OpenACCClause.h
@@ -1280,13 +1280,31 @@ public:
// 'main' declaration used for initializaiton, which is fixed.
struct OpenACCReductionRecipe {
VarDecl *AllocaDecl;
- // TODO: OpenACC: this should eventually have the operations here too.
- OpenACCReductionRecipe(VarDecl *A) : AllocaDecl(A) {}
+ // A combiner recipe is represented by an operation expression. However, in
+ // order to generate these properly, we have to make up a LHS and a RHS
+ // expression for the purposes of generation.
+ struct CombinerRecipe {
+ VarDecl *LHS;
+ VarDecl *RHS;
+ Expr *Op;
+ };
+
+ // Contains a collection of the recipe elements we need for the combiner:
+ // -For Scalars, there will be 1 element, just the combiner for that scalar.
+ // -For a struct with a valid operator, this will be 1 element, just that
+ // call.
+ // -For a struct without the operator, this will be 1 element per field, which
+ // should be the combiner for that element.
+ // -For an array of any of the above, it will be the above for the element.
+ llvm::SmallVector<CombinerRecipe, 1> CombinerRecipes;
+
+ OpenACCReductionRecipe(VarDecl *A, llvm::ArrayRef<CombinerRecipe> Combiners)
+ : AllocaDecl(A), CombinerRecipes(Combiners) {}
bool isSet() const { return AllocaDecl; }
static OpenACCReductionRecipe Empty() {
- return OpenACCReductionRecipe(/*AllocaDecl=*/nullptr);
+ return OpenACCReductionRecipe(/*AllocaDecl=*/nullptr, {});
}
};
diff --git a/clang/include/clang/ASTMatchers/GtestMatchers.h b/clang/include/clang/ASTMatchers/GtestMatchers.h
deleted file mode 100644
index e19d91a..0000000
--- a/clang/include/clang/ASTMatchers/GtestMatchers.h
+++ /dev/null
@@ -1,87 +0,0 @@
-//===- GtestMatchers.h - AST Matchers for GTest -----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements matchers specific to structures in the Googletest
-// (gtest) framework.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_ASTMATCHERS_GTESTMATCHERS_H
-#define LLVM_CLANG_ASTMATCHERS_GTESTMATCHERS_H
-
-#include "clang/AST/Stmt.h"
-#include "clang/ASTMatchers/ASTMatchers.h"
-#include "llvm/ADT/StringRef.h"
-
-namespace clang {
-namespace ast_matchers {
-
-/// Gtest's comparison operations.
-enum class GtestCmp {
- Eq,
- Ne,
- Ge,
- Gt,
- Le,
- Lt,
-};
-
-/// This enum indicates whether the mock method in the matched ON_CALL or
-/// EXPECT_CALL macro has arguments. For example, `None` can be used to match
-/// `ON_CALL(mock, TwoParamMethod)` whereas `Some` can be used to match
-/// `ON_CALL(mock, TwoParamMethod(m1, m2))`.
-enum class MockArgs {
- None,
- Some,
-};
-
-/// Matcher for gtest's ASSERT comparison macros including ASSERT_EQ, ASSERT_NE,
-/// ASSERT_GE, ASSERT_GT, ASSERT_LE and ASSERT_LT.
-internal::BindableMatcher<Stmt> gtestAssert(GtestCmp Cmp, StatementMatcher Left,
- StatementMatcher Right);
-
-/// Matcher for gtest's ASSERT_THAT macro.
-internal::BindableMatcher<Stmt> gtestAssertThat(StatementMatcher Actual,
- StatementMatcher Matcher);
-
-/// Matcher for gtest's EXPECT comparison macros including EXPECT_EQ, EXPECT_NE,
-/// EXPECT_GE, EXPECT_GT, EXPECT_LE and EXPECT_LT.
-internal::BindableMatcher<Stmt> gtestExpect(GtestCmp Cmp, StatementMatcher Left,
- StatementMatcher Right);
-
-/// Matcher for gtest's EXPECT_THAT macro.
-internal::BindableMatcher<Stmt> gtestExpectThat(StatementMatcher Actual,
- StatementMatcher Matcher);
-
-/// Matcher for gtest's EXPECT_CALL macro. `MockObject` matches the mock
-/// object and `MockMethodName` is the name of the method invoked on the mock
-/// object.
-internal::BindableMatcher<Stmt> gtestExpectCall(StatementMatcher MockObject,
- llvm::StringRef MockMethodName,
- MockArgs Args);
-
-/// Matcher for gtest's EXPECT_CALL macro. `MockCall` matches the whole mock
-/// member method call. This API is more flexible but requires more knowledge of
-/// the AST structure of EXPECT_CALL macros.
-internal::BindableMatcher<Stmt> gtestExpectCall(StatementMatcher MockCall,
- MockArgs Args);
-
-/// Like the first `gtestExpectCall` overload but for `ON_CALL`.
-internal::BindableMatcher<Stmt> gtestOnCall(StatementMatcher MockObject,
- llvm::StringRef MockMethodName,
- MockArgs Args);
-
-/// Like the second `gtestExpectCall` overload but for `ON_CALL`.
-internal::BindableMatcher<Stmt> gtestOnCall(StatementMatcher MockCall,
- MockArgs Args);
-
-} // namespace ast_matchers
-} // namespace clang
-
-#endif // LLVM_CLANG_ASTMATCHERS_GTESTMATCHERS_H
-
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 3c697ed..3cde249 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4944,6 +4944,7 @@ def HLSLResourceBinding: InheritableAttr {
std::optional<unsigned> SlotNumber;
unsigned SpaceNumber;
std::optional<unsigned> ImplicitBindingOrderID;
+ std::optional<unsigned> ImplicitCounterBindingOrderID;
public:
void setBinding(RegisterType RT, std::optional<unsigned> SlotNum, unsigned SpaceNum) {
@@ -4976,6 +4977,17 @@ def HLSLResourceBinding: InheritableAttr {
assert(hasImplicitBindingOrderID() && "attribute does not have implicit binding order id");
return ImplicitBindingOrderID.value();
}
+ void setImplicitCounterBindingOrderID(uint32_t Value) {
+ assert(!hasImplicitCounterBindingOrderID() && "attribute already has implicit counter binding order id");
+ ImplicitCounterBindingOrderID = Value;
+ }
+ bool hasImplicitCounterBindingOrderID() const {
+ return ImplicitCounterBindingOrderID.has_value();
+ }
+ uint32_t getImplicitCounterBindingOrderID() const {
+ assert(hasImplicitCounterBindingOrderID() && "attribute does not have implicit counter binding order id");
+ return ImplicitCounterBindingOrderID.value();
+ }
}];
}
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 468121f..792e2e0 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4945,6 +4945,12 @@ def HLSLResourceHandleFromImplicitBinding : LangBuiltin<"HLSL_LANG"> {
let Prototype = "void(...)";
}
+def HLSLResourceCounterHandleFromImplicitBinding : LangBuiltin<"HLSL_LANG"> {
+ let Spellings = ["__builtin_hlsl_resource_counterhandlefromimplicitbinding"];
+ let Attributes = [NoThrow, CustomTypeChecking];
+ let Prototype = "void(...)";
+}
+
def HLSLResourceNonUniformIndex : LangBuiltin<"HLSL_LANG"> {
let Spellings = ["__builtin_hlsl_resource_nonuniformindex"];
let Attributes = [NoThrow];
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 89b519e..11ad61f 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -374,6 +374,12 @@ public:
resOperands, attrs);
}
+ cir::CallOp createCallOp(mlir::Location loc, mlir::SymbolRefAttr callee,
+ mlir::ValueRange operands = mlir::ValueRange(),
+ llvm::ArrayRef<mlir::NamedAttribute> attrs = {}) {
+ return createCallOp(loc, callee, cir::VoidType(), operands, attrs);
+ }
+
cir::CallOp createTryCallOp(
mlir::Location loc, mlir::SymbolRefAttr callee = mlir::SymbolRefAttr(),
mlir::Type returnType = cir::VoidType(),
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 3b7b130..ace2086 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -38,9 +38,8 @@ struct MissingFeatures {
static bool opGlobalPartition() { return false; }
static bool opGlobalUsedOrCompilerUsed() { return false; }
static bool opGlobalAnnotations() { return false; }
- static bool opGlobalDtorLowering() { return false; }
static bool opGlobalCtorPriority() { return false; }
- static bool opGlobalCtorList() { return false; }
+ static bool opGlobalDtorList() { return false; }
static bool setDSOLocal() { return false; }
static bool setComdat() { return false; }
@@ -80,9 +79,12 @@ struct MissingFeatures {
static bool opFuncExtraAttrs() { return false; }
static bool opFuncMaybeHandleStaticInExternC() { return false; }
static bool opFuncMultipleReturnVals() { return false; }
+ static bool opFuncNoUnwind() { return false; }
static bool opFuncOperandBundles() { return false; }
static bool opFuncParameterAttributes() { return false; }
+ static bool opFuncReadOnly() { return false; }
static bool opFuncSection() { return false; }
+ static bool opFuncWillReturn() { return false; }
static bool setLLVMFunctionFEnvAttributes() { return false; }
static bool setFunctionAttributes() { return false; }
@@ -137,7 +139,6 @@ struct MissingFeatures {
// RecordType
static bool skippedLayout() { return false; }
static bool astRecordDeclAttr() { return false; }
- static bool recordZeroInitPadding() { return false; }
static bool zeroSizeRecordMembers() { return false; }
// Coroutines
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index f3b5478..3cd033e 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2769,10 +2769,19 @@ void OpenACCClauseProfiler::VisitReductionClause(
for (auto &Recipe : Clause.getRecipes()) {
Profiler.VisitDecl(Recipe.AllocaDecl);
+
// TODO: OpenACC: Make sure we remember to update this when we figure out
// what we're adding for the operation recipe, in the meantime, a static
// assert will make sure we don't add something.
- static_assert(sizeof(OpenACCReductionRecipe) == sizeof(int *));
+ static_assert(sizeof(OpenACCReductionRecipe::CombinerRecipe) ==
+ 3 * sizeof(int *));
+ for (auto &CombinerRecipe : Recipe.CombinerRecipes) {
+ if (CombinerRecipe.Op) {
+ Profiler.VisitDecl(CombinerRecipe.LHS);
+ Profiler.VisitDecl(CombinerRecipe.RHS);
+ Profiler.VisitStmt(CombinerRecipe.Op);
+ }
+ }
}
}
diff --git a/clang/lib/ASTMatchers/CMakeLists.txt b/clang/lib/ASTMatchers/CMakeLists.txt
index 7769fd6..29ad27df 100644
--- a/clang/lib/ASTMatchers/CMakeLists.txt
+++ b/clang/lib/ASTMatchers/CMakeLists.txt
@@ -8,7 +8,6 @@ set(LLVM_LINK_COMPONENTS
add_clang_library(clangASTMatchers
ASTMatchFinder.cpp
ASTMatchersInternal.cpp
- GtestMatchers.cpp
LowLevelHelpers.cpp
LINK_LIBS
diff --git a/clang/lib/ASTMatchers/GtestMatchers.cpp b/clang/lib/ASTMatchers/GtestMatchers.cpp
deleted file mode 100644
index 7c135bb..0000000
--- a/clang/lib/ASTMatchers/GtestMatchers.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-//===- GtestMatchers.cpp - AST Matchers for Gtest ---------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements several matchers for popular gtest macros. In general,
-// AST matchers cannot match calls to macros. However, we can simulate such
-// matches if the macro definition has identifiable elements that themselves can
-// be matched. In that case, we can match on those elements and then check that
-// the match occurs within an expansion of the desired macro. The more uncommon
-// the identified elements, the more efficient this process will be.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/ASTMatchers/GtestMatchers.h"
-#include "llvm/ADT/StringRef.h"
-
-namespace clang {
-namespace ast_matchers {
-namespace {
-
-enum class MacroType {
- Expect,
- Assert,
- On,
-};
-
-} // namespace
-
-static DeclarationMatcher getComparisonDecl(GtestCmp Cmp) {
- switch (Cmp) {
- case GtestCmp::Eq:
- return cxxMethodDecl(hasName("Compare"),
- ofClass(cxxRecordDecl(isSameOrDerivedFrom(
- hasName("::testing::internal::EqHelper")))));
- case GtestCmp::Ne:
- return functionDecl(hasName("::testing::internal::CmpHelperNE"));
- case GtestCmp::Ge:
- return functionDecl(hasName("::testing::internal::CmpHelperGE"));
- case GtestCmp::Gt:
- return functionDecl(hasName("::testing::internal::CmpHelperGT"));
- case GtestCmp::Le:
- return functionDecl(hasName("::testing::internal::CmpHelperLE"));
- case GtestCmp::Lt:
- return functionDecl(hasName("::testing::internal::CmpHelperLT"));
- }
- llvm_unreachable("Unhandled GtestCmp enum");
-}
-
-static llvm::StringRef getMacroTypeName(MacroType Macro) {
- switch (Macro) {
- case MacroType::Expect:
- return "EXPECT";
- case MacroType::Assert:
- return "ASSERT";
- case MacroType::On:
- return "ON";
- }
- llvm_unreachable("Unhandled MacroType enum");
-}
-
-static llvm::StringRef getComparisonTypeName(GtestCmp Cmp) {
- switch (Cmp) {
- case GtestCmp::Eq:
- return "EQ";
- case GtestCmp::Ne:
- return "NE";
- case GtestCmp::Ge:
- return "GE";
- case GtestCmp::Gt:
- return "GT";
- case GtestCmp::Le:
- return "LE";
- case GtestCmp::Lt:
- return "LT";
- }
- llvm_unreachable("Unhandled GtestCmp enum");
-}
-
-static std::string getMacroName(MacroType Macro, GtestCmp Cmp) {
- return (getMacroTypeName(Macro) + "_" + getComparisonTypeName(Cmp)).str();
-}
-
-static std::string getMacroName(MacroType Macro, llvm::StringRef Operation) {
- return (getMacroTypeName(Macro) + "_" + Operation).str();
-}
-
-// Under the hood, ON_CALL is expanded to a call to `InternalDefaultActionSetAt`
-// to set a default action spec to the underlying function mocker, while
-// EXPECT_CALL is expanded to a call to `InternalExpectedAt` to set a new
-// expectation spec.
-static llvm::StringRef getSpecSetterName(MacroType Macro) {
- switch (Macro) {
- case MacroType::On:
- return "InternalDefaultActionSetAt";
- case MacroType::Expect:
- return "InternalExpectedAt";
- default:
- llvm_unreachable("Unhandled MacroType enum");
- }
- llvm_unreachable("Unhandled MacroType enum");
-}
-
-// In general, AST matchers cannot match calls to macros. However, we can
-// simulate such matches if the macro definition has identifiable elements that
-// themselves can be matched. In that case, we can match on those elements and
-// then check that the match occurs within an expansion of the desired
-// macro. The more uncommon the identified elements, the more efficient this
-// process will be.
-//
-// We use this approach to implement the derived matchers gtestAssert and
-// gtestExpect.
-static internal::BindableMatcher<Stmt>
-gtestComparisonInternal(MacroType Macro, GtestCmp Cmp, StatementMatcher Left,
- StatementMatcher Right) {
- return callExpr(isExpandedFromMacro(getMacroName(Macro, Cmp)),
- callee(getComparisonDecl(Cmp)), hasArgument(2, Left),
- hasArgument(3, Right));
-}
-
-static internal::BindableMatcher<Stmt>
-gtestThatInternal(MacroType Macro, StatementMatcher Actual,
- StatementMatcher Matcher) {
- return cxxOperatorCallExpr(
- isExpandedFromMacro(getMacroName(Macro, "THAT")),
- hasOverloadedOperatorName("()"), hasArgument(2, Actual),
- hasArgument(
- 0, expr(hasType(classTemplateSpecializationDecl(hasName(
- "::testing::internal::PredicateFormatterFromMatcher"))),
- ignoringImplicit(
- callExpr(callee(functionDecl(hasName(
- "::testing::internal::"
- "MakePredicateFormatterFromMatcher"))),
- hasArgument(0, ignoringImplicit(Matcher)))))));
-}
-
-static internal::BindableMatcher<Stmt>
-gtestCallInternal(MacroType Macro, StatementMatcher MockCall, MockArgs Args) {
- // A ON_CALL or EXPECT_CALL macro expands to different AST structures
- // depending on whether the mock method has arguments or not.
- switch (Args) {
- // For example,
- // `ON_CALL(mock, TwoParamMethod)` is expanded to
- // `mock.gmock_TwoArgsMethod(WithoutMatchers(),
- // nullptr).InternalDefaultActionSetAt(...)`.
- // EXPECT_CALL is the same except
- // that it calls `InternalExpectedAt` instead of `InternalDefaultActionSetAt`
- // in the end.
- case MockArgs::None:
- return cxxMemberCallExpr(
- isExpandedFromMacro(getMacroName(Macro, "CALL")),
- callee(functionDecl(hasName(getSpecSetterName(Macro)))),
- onImplicitObjectArgument(ignoringImplicit(MockCall)));
- // For example,
- // `ON_CALL(mock, TwoParamMethod(m1, m2))` is expanded to
- // `mock.gmock_TwoParamMethod(m1,m2)(WithoutMatchers(),
- // nullptr).InternalDefaultActionSetAt(...)`.
- // EXPECT_CALL is the same except that it calls `InternalExpectedAt` instead
- // of `InternalDefaultActionSetAt` in the end.
- case MockArgs::Some:
- return cxxMemberCallExpr(
- isExpandedFromMacro(getMacroName(Macro, "CALL")),
- callee(functionDecl(hasName(getSpecSetterName(Macro)))),
- onImplicitObjectArgument(ignoringImplicit(cxxOperatorCallExpr(
- hasOverloadedOperatorName("()"), argumentCountIs(3),
- hasArgument(0, ignoringImplicit(MockCall))))));
- }
- llvm_unreachable("Unhandled MockArgs enum");
-}
-
-static internal::BindableMatcher<Stmt>
-gtestCallInternal(MacroType Macro, StatementMatcher MockObject,
- llvm::StringRef MockMethodName, MockArgs Args) {
- return gtestCallInternal(
- Macro,
- cxxMemberCallExpr(
- onImplicitObjectArgument(MockObject),
- callee(functionDecl(hasName(("gmock_" + MockMethodName).str())))),
- Args);
-}
-
-internal::BindableMatcher<Stmt> gtestAssert(GtestCmp Cmp, StatementMatcher Left,
- StatementMatcher Right) {
- return gtestComparisonInternal(MacroType::Assert, Cmp, Left, Right);
-}
-
-internal::BindableMatcher<Stmt> gtestExpect(GtestCmp Cmp, StatementMatcher Left,
- StatementMatcher Right) {
- return gtestComparisonInternal(MacroType::Expect, Cmp, Left, Right);
-}
-
-internal::BindableMatcher<Stmt> gtestAssertThat(StatementMatcher Actual,
- StatementMatcher Matcher) {
- return gtestThatInternal(MacroType::Assert, Actual, Matcher);
-}
-
-internal::BindableMatcher<Stmt> gtestExpectThat(StatementMatcher Actual,
- StatementMatcher Matcher) {
- return gtestThatInternal(MacroType::Expect, Actual, Matcher);
-}
-
-internal::BindableMatcher<Stmt> gtestOnCall(StatementMatcher MockObject,
- llvm::StringRef MockMethodName,
- MockArgs Args) {
- return gtestCallInternal(MacroType::On, MockObject, MockMethodName, Args);
-}
-
-internal::BindableMatcher<Stmt> gtestOnCall(StatementMatcher MockCall,
- MockArgs Args) {
- return gtestCallInternal(MacroType::On, MockCall, Args);
-}
-
-internal::BindableMatcher<Stmt> gtestExpectCall(StatementMatcher MockObject,
- llvm::StringRef MockMethodName,
- MockArgs Args) {
- return gtestCallInternal(MacroType::Expect, MockObject, MockMethodName, Args);
-}
-
-internal::BindableMatcher<Stmt> gtestExpectCall(StatementMatcher MockCall,
- MockArgs Args) {
- return gtestCallInternal(MacroType::Expect, MockCall, Args);
-}
-
-} // end namespace ast_matchers
-} // end namespace clang
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 58345b4..25afe8b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -122,6 +122,11 @@ public:
return getPointerTo(cir::VPtrType::get(getContext()));
}
+ cir::FuncType getFuncType(llvm::ArrayRef<mlir::Type> params, mlir::Type retTy,
+ bool isVarArg = false) {
+ return cir::FuncType::get(params, retTy, isVarArg);
+ }
+
/// Get a CIR record kind from a AST declaration tag.
cir::RecordType::RecordKind getRecordKind(const clang::TagTypeKind kind) {
switch (kind) {
@@ -372,6 +377,15 @@ public:
return cir::BinOp::create(*this, loc, cir::BinOpKind::Div, lhs, rhs);
}
+ mlir::Value createDynCast(mlir::Location loc, mlir::Value src,
+ cir::PointerType destType, bool isRefCast,
+ cir::DynamicCastInfoAttr info) {
+ auto castKind =
+ isRefCast ? cir::DynamicCastKind::Ref : cir::DynamicCastKind::Ptr;
+ return cir::DynamicCastOp::create(*this, loc, destType, castKind, src, info,
+ /*relative_layout=*/false);
+ }
+
Address createBaseClassAddr(mlir::Location loc, Address addr,
mlir::Type destType, unsigned offset,
bool assumeNotNull) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
index d5b35c2..274d11b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "CIRGenCXXABI.h"
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
@@ -95,7 +96,63 @@ static void emitDeclDestroy(CIRGenFunction &cgf, const VarDecl *vd,
return;
}
- cgf.cgm.errorNYI(vd->getSourceRange(), "global with destructor");
+ // If not constant storage we'll emit this regardless of NeedsDtor value.
+ CIRGenBuilderTy &builder = cgf.getBuilder();
+
+ // Prepare the dtor region.
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ mlir::Block *block = builder.createBlock(&addr.getDtorRegion());
+ CIRGenFunction::LexicalScope lexScope{cgf, addr.getLoc(),
+ builder.getInsertionBlock()};
+ lexScope.setAsGlobalInit();
+ builder.setInsertionPointToStart(block);
+
+ CIRGenModule &cgm = cgf.cgm;
+ QualType type = vd->getType();
+
+ // Special-case non-array C++ destructors, if they have the right signature.
+ // Under some ABIs, destructors return this instead of void, and cannot be
+ // passed directly to __cxa_atexit if the target does not allow this
+ // mismatch.
+ const CXXRecordDecl *record = type->getAsCXXRecordDecl();
+ bool canRegisterDestructor =
+ record && (!cgm.getCXXABI().hasThisReturn(
+ GlobalDecl(record->getDestructor(), Dtor_Complete)) ||
+ cgm.getCXXABI().canCallMismatchedFunctionType());
+
+ // If __cxa_atexit is disabled via a flag, a different helper function is
+ // generated elsewhere which uses atexit instead, and it takes the destructor
+ // directly.
+ cir::FuncOp fnOp;
+ if (record && (canRegisterDestructor || cgm.getCodeGenOpts().CXAAtExit)) {
+ if (vd->getTLSKind())
+ cgm.errorNYI(vd->getSourceRange(), "TLS destructor");
+ assert(!record->hasTrivialDestructor());
+ assert(!cir::MissingFeatures::openCL());
+ CXXDestructorDecl *dtor = record->getDestructor();
+ // In LLVM OG codegen this is done in registerGlobalDtor, but CIRGen
+ // relies on LoweringPrepare for further decoupling, so build the
+ // call right here.
+ auto gd = GlobalDecl(dtor, Dtor_Complete);
+ fnOp = cgm.getAddrAndTypeOfCXXStructor(gd).second;
+ cgf.getBuilder().createCallOp(
+ cgf.getLoc(vd->getSourceRange()),
+ mlir::FlatSymbolRefAttr::get(fnOp.getSymNameAttr()),
+ mlir::ValueRange{cgm.getAddrOfGlobalVar(vd)});
+ } else {
+ cgm.errorNYI(vd->getSourceRange(), "array destructor");
+ }
+ assert(fnOp && "expected cir.func");
+ cgm.getCXXABI().registerGlobalDtor(vd, fnOp, nullptr);
+
+ builder.setInsertionPointToEnd(block);
+ if (block->empty()) {
+ block->erase();
+ // Don't confuse lexical cleanup.
+ builder.clearInsertionPoint();
+ } else {
+ builder.create<cir::YieldOp>(addr.getLoc());
+ }
}
cir::FuncOp CIRGenModule::codegenCXXStructor(GlobalDecl gd) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
index 2465a68..06f41cd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
@@ -54,6 +54,12 @@ public:
Address thisAddr, const CXXRecordDecl *classDecl,
const CXXRecordDecl *baseClassDecl) = 0;
+ virtual mlir::Value emitDynamicCast(CIRGenFunction &cgf, mlir::Location loc,
+ QualType srcRecordTy,
+ QualType destRecordTy,
+ cir::PointerType destCIRTy,
+ bool isRefCast, Address src) = 0;
+
public:
/// Similar to AddedStructorArgs, but only notes the number of additional
/// arguments.
@@ -149,6 +155,14 @@ public:
/// Loads the incoming C++ this pointer as it was passed by the caller.
mlir::Value loadIncomingCXXThis(CIRGenFunction &cgf);
+ /// Get the implicit (second) parameter that comes after the "this" pointer,
+ /// or nullptr if there is isn't one.
+ virtual mlir::Value getCXXDestructorImplicitParam(CIRGenFunction &cgf,
+ const CXXDestructorDecl *dd,
+ CXXDtorType type,
+ bool forVirtualBase,
+ bool delegating) = 0;
+
/// Emit constructor variants required by this ABI.
virtual void emitCXXConstructors(const clang::CXXConstructorDecl *d) = 0;
@@ -160,6 +174,14 @@ public:
bool forVirtualBase, bool delegating,
Address thisAddr, QualType thisTy) = 0;
+ /// Emit code to force the execution of a destructor during global
+ /// teardown. The default implementation of this uses atexit.
+ ///
+ /// \param dtor - a function taking a single pointer argument
+ /// \param addr - a pointer to pass to the destructor function.
+ virtual void registerGlobalDtor(const VarDecl *vd, cir::FuncOp dtor,
+ mlir::Value addr) = 0;
+
/// Checks if ABI requires extra virtual offset for vtable field.
virtual bool
isVirtualOffsetNeededForVTableField(CIRGenFunction &cgf,
@@ -233,6 +255,16 @@ public:
return false;
}
+ /// Returns true if the target allows calling a function through a pointer
+ /// with a different signature than the actual function (or equivalently,
+ /// bitcasting a function or function pointer to a different function type).
+ /// In principle in the most general case this could depend on the target, the
+ /// calling convention, and the actual types of the arguments and return
+ /// value. Here it just means whether the signature mismatch could *ever* be
+ /// allowed; in other words, does the target do strict checking of signatures
+ /// for all calls.
+ virtual bool canCallMismatchedFunctionType() const { return true; }
+
/// Gets the mangle context.
clang::MangleContext &getMangleContext() { return *mangleContext; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index d9ebf19..485b2c8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -126,6 +126,30 @@ static bool isInitializerOfDynamicClass(const CXXCtorInitializer *baseInit) {
}
namespace {
+/// Call the destructor for a direct base class.
+struct CallBaseDtor final : EHScopeStack::Cleanup {
+ const CXXRecordDecl *baseClass;
+ bool baseIsVirtual;
+ CallBaseDtor(const CXXRecordDecl *base, bool baseIsVirtual)
+ : baseClass(base), baseIsVirtual(baseIsVirtual) {}
+
+ void emit(CIRGenFunction &cgf) override {
+ const CXXRecordDecl *derivedClass =
+ cast<CXXMethodDecl>(cgf.curFuncDecl)->getParent();
+
+ const CXXDestructorDecl *d = baseClass->getDestructor();
+ // We are already inside a destructor, so presumably the object being
+ // destroyed should have the expected type.
+ QualType thisTy = d->getFunctionObjectParameterType();
+ assert(cgf.currSrcLoc && "expected source location");
+ Address addr = cgf.getAddressOfDirectBaseInCompleteClass(
+ *cgf.currSrcLoc, cgf.loadCXXThisAddress(), derivedClass, baseClass,
+ baseIsVirtual);
+ cgf.emitCXXDestructorCall(d, Dtor_Base, baseIsVirtual,
+ /*delegating=*/false, addr, thisTy);
+ }
+};
+
/// A visitor which checks whether an initializer uses 'this' in a
/// way which requires the vtable to be properly set.
struct DynamicThisUseChecker
@@ -891,12 +915,6 @@ public:
assert(!cir::MissingFeatures::ehCleanupFlags());
cgf.emitDestroy(lv.getAddress(), field->getType(), destroyer);
}
-
- // This is a placeholder until EHCleanupScope is implemented.
- size_t getSize() const override {
- assert(!cir::MissingFeatures::ehCleanupScope());
- return sizeof(DestroyField);
- }
};
} // namespace
@@ -928,8 +946,21 @@ void CIRGenFunction::enterDtorCleanups(const CXXDestructorDecl *dd,
if (dtorType == Dtor_Complete) {
assert(!cir::MissingFeatures::sanitizers());
- if (classDecl->getNumVBases())
- cgm.errorNYI(dd->getSourceRange(), "virtual base destructor cleanups");
+ // We push them in the forward order so that they'll be popped in
+ // the reverse order.
+ for (const CXXBaseSpecifier &base : classDecl->vbases()) {
+ auto *baseClassDecl = base.getType()->castAsCXXRecordDecl();
+
+ if (baseClassDecl->hasTrivialDestructor()) {
+ // Under SanitizeMemoryUseAfterDtor, poison the trivial base class
+ // memory. For non-trival base classes the same is done in the class
+ // destructor.
+ assert(!cir::MissingFeatures::sanitizers());
+ } else {
+ ehStack.pushCleanup<CallBaseDtor>(NormalAndEHCleanup, baseClassDecl,
+ /*baseIsVirtual=*/true);
+ }
+ }
return;
}
@@ -948,8 +979,8 @@ void CIRGenFunction::enterDtorCleanups(const CXXDestructorDecl *dd,
if (baseClassDecl->hasTrivialDestructor())
assert(!cir::MissingFeatures::sanitizers());
else
- cgm.errorNYI(dd->getSourceRange(),
- "non-trivial base destructor cleanups");
+ ehStack.pushCleanup<CallBaseDtor>(NormalAndEHCleanup, baseClassDecl,
+ /*baseIsVirtual=*/false);
}
assert(!cir::MissingFeatures::sanitizers());
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 563a753..039d290 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -695,12 +695,6 @@ struct DestroyObject final : EHScopeStack::Cleanup {
void emit(CIRGenFunction &cgf) override {
cgf.emitDestroy(addr, type, destroyer);
}
-
- // This is a placeholder until EHCleanupScope is implemented.
- size_t getSize() const override {
- assert(!cir::MissingFeatures::ehCleanupScope());
- return sizeof(DestroyObject);
- }
};
} // namespace
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index be94890..f416571 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1185,10 +1185,16 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) {
case CK_BuiltinFnToFnPtr:
llvm_unreachable("builtin functions are handled elsewhere");
+ case CK_Dynamic: {
+ LValue lv = emitLValue(e->getSubExpr());
+ Address v = lv.getAddress();
+ const auto *dce = cast<CXXDynamicCastExpr>(e);
+ return makeNaturalAlignAddrLValue(emitDynamicCast(v, dce), e->getType());
+ }
+
// These are never l-values; just use the aggregate emission code.
case CK_NonAtomicToAtomic:
case CK_AtomicToNonAtomic:
- case CK_Dynamic:
case CK_ToUnion:
case CK_BaseToDerived:
case CK_AddressSpaceConversion:
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
index 4eb8ca8..97c0944 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
@@ -463,12 +463,6 @@ struct CallObjectDelete final : EHScopeStack::Cleanup {
void emit(CIRGenFunction &cgf) override {
cgf.emitDeleteCall(operatorDelete, ptr, elementType);
}
-
- // This is a placeholder until EHCleanupScope is implemented.
- size_t getSize() const override {
- assert(!cir::MissingFeatures::ehCleanupScope());
- return sizeof(CallObjectDelete);
- }
};
} // namespace
@@ -728,3 +722,43 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD,
// Emit the call to delete.
emitNewDeleteCall(*this, deleteFD, deleteFTy, deleteArgs);
}
+
+mlir::Value CIRGenFunction::emitDynamicCast(Address thisAddr,
+ const CXXDynamicCastExpr *dce) {
+ mlir::Location loc = getLoc(dce->getSourceRange());
+
+ cgm.emitExplicitCastExprType(dce, this);
+ QualType destTy = dce->getTypeAsWritten();
+ QualType srcTy = dce->getSubExpr()->getType();
+
+ // C++ [expr.dynamic.cast]p7:
+ // If T is "pointer to cv void," then the result is a pointer to the most
+ // derived object pointed to by v.
+ bool isDynCastToVoid = destTy->isVoidPointerType();
+ bool isRefCast = destTy->isReferenceType();
+
+ QualType srcRecordTy;
+ QualType destRecordTy;
+ if (isDynCastToVoid) {
+ srcRecordTy = srcTy->getPointeeType();
+ // No destRecordTy.
+ } else if (const PointerType *destPTy = destTy->getAs<PointerType>()) {
+ srcRecordTy = srcTy->castAs<PointerType>()->getPointeeType();
+ destRecordTy = destPTy->getPointeeType();
+ } else {
+ srcRecordTy = srcTy;
+ destRecordTy = destTy->castAs<ReferenceType>()->getPointeeType();
+ }
+
+ assert(srcRecordTy->isRecordType() && "source type must be a record type!");
+ assert(!cir::MissingFeatures::emitTypeCheck());
+
+ if (dce->isAlwaysNull()) {
+ cgm.errorNYI(dce->getSourceRange(), "emitDynamicCastToNull");
+ return {};
+ }
+
+ auto destCirTy = mlir::cast<cir::PointerType>(convertType(destTy));
+ return cgm.getCXXABI().emitDynamicCast(*this, loc, srcRecordTy, destRecordTy,
+ destCirTy, isRefCast, thisAddr);
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 59aa257..89e9ec4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -500,6 +500,26 @@ private:
bool appendBitField(const FieldDecl *field, uint64_t fieldOffset,
cir::IntAttr ci, bool allowOverwrite = false);
+ /// Applies zero-initialization to padding bytes before and within a field.
+ /// \param layout The record layout containing field offset information.
+ /// \param fieldNo The field index in the record.
+ /// \param field The field declaration.
+ /// \param allowOverwrite Whether to allow overwriting existing values.
+ /// \param sizeSoFar The current size processed, updated by this function.
+ /// \param zeroFieldSize Set to true if the field has zero size.
+ /// \returns true on success, false if padding could not be applied.
+ bool applyZeroInitPadding(const ASTRecordLayout &layout, unsigned fieldNo,
+ const FieldDecl &field, bool allowOverwrite,
+ CharUnits &sizeSoFar, bool &zeroFieldSize);
+
+ /// Applies zero-initialization to trailing padding bytes in a record.
+ /// \param layout The record layout containing size information.
+ /// \param allowOverwrite Whether to allow overwriting existing values.
+ /// \param sizeSoFar The current size processed.
+ /// \returns true on success, false if padding could not be applied.
+ bool applyZeroInitPadding(const ASTRecordLayout &layout, bool allowOverwrite,
+ CharUnits &sizeSoFar);
+
bool build(InitListExpr *ile, bool allowOverwrite);
bool build(const APValue &val, const RecordDecl *rd, bool isPrimaryBase,
const CXXRecordDecl *vTableClass, CharUnits baseOffset);
@@ -548,6 +568,49 @@ bool ConstRecordBuilder::appendBitField(const FieldDecl *field,
allowOverwrite);
}
+bool ConstRecordBuilder::applyZeroInitPadding(
+ const ASTRecordLayout &layout, unsigned fieldNo, const FieldDecl &field,
+ bool allowOverwrite, CharUnits &sizeSoFar, bool &zeroFieldSize) {
+ uint64_t startBitOffset = layout.getFieldOffset(fieldNo);
+ CharUnits startOffset =
+ cgm.getASTContext().toCharUnitsFromBits(startBitOffset);
+ if (sizeSoFar < startOffset) {
+ if (!appendBytes(sizeSoFar, computePadding(cgm, startOffset - sizeSoFar),
+ allowOverwrite))
+ return false;
+ }
+
+ if (!field.isBitField()) {
+ CharUnits fieldSize =
+ cgm.getASTContext().getTypeSizeInChars(field.getType());
+ sizeSoFar = startOffset + fieldSize;
+ zeroFieldSize = fieldSize.isZero();
+ } else {
+ const CIRGenRecordLayout &rl =
+ cgm.getTypes().getCIRGenRecordLayout(field.getParent());
+ const CIRGenBitFieldInfo &info = rl.getBitFieldInfo(&field);
+ uint64_t endBitOffset = startBitOffset + info.size;
+ sizeSoFar = cgm.getASTContext().toCharUnitsFromBits(endBitOffset);
+ if (endBitOffset % cgm.getASTContext().getCharWidth() != 0)
+ sizeSoFar++;
+ zeroFieldSize = info.size == 0;
+ }
+ return true;
+}
+
+bool ConstRecordBuilder::applyZeroInitPadding(const ASTRecordLayout &layout,
+ bool allowOverwrite,
+ CharUnits &sizeSoFar) {
+ CharUnits totalSize = layout.getSize();
+ if (sizeSoFar < totalSize) {
+ if (!appendBytes(sizeSoFar, computePadding(cgm, totalSize - sizeSoFar),
+ allowOverwrite))
+ return false;
+ }
+ sizeSoFar = totalSize;
+ return true;
+}
+
bool ConstRecordBuilder::build(InitListExpr *ile, bool allowOverwrite) {
RecordDecl *rd = ile->getType()
->castAs<clang::RecordType>()
@@ -562,11 +625,9 @@ bool ConstRecordBuilder::build(InitListExpr *ile, bool allowOverwrite) {
if (cxxrd->getNumBases())
return false;
- if (cgm.shouldZeroInitPadding()) {
- assert(!cir::MissingFeatures::recordZeroInitPadding());
- cgm.errorNYI(rd->getSourceRange(), "zero init padding");
- return false;
- }
+ const bool zeroInitPadding = cgm.shouldZeroInitPadding();
+ bool zeroFieldSize = false;
+ CharUnits sizeSoFar = CharUnits::Zero();
unsigned elementNo = 0;
for (auto [index, field] : llvm::enumerate(rd->fields())) {
@@ -596,7 +657,10 @@ bool ConstRecordBuilder::build(InitListExpr *ile, bool allowOverwrite) {
continue;
}
- assert(!cir::MissingFeatures::recordZeroInitPadding());
+ if (zeroInitPadding &&
+ !applyZeroInitPadding(layout, index, *field, allowOverwrite, sizeSoFar,
+ zeroFieldSize))
+ return false;
// When emitting a DesignatedInitUpdateExpr, a nested InitListExpr
// represents additional overwriting of our current constant value, and not
@@ -641,8 +705,8 @@ bool ConstRecordBuilder::build(InitListExpr *ile, bool allowOverwrite) {
}
}
- assert(!cir::MissingFeatures::recordZeroInitPadding());
- return true;
+ return !zeroInitPadding ||
+ applyZeroInitPadding(layout, allowOverwrite, sizeSoFar);
}
namespace {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 7edd83e..637f9ef 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -1916,6 +1916,11 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
return builder.createIntToPtr(middleVal, destCIRTy);
}
+ case CK_Dynamic: {
+ Address v = cgf.emitPointerWithAlignment(subExpr);
+ const auto *dce = cast<CXXDynamicCastExpr>(ce);
+ return cgf.emitDynamicCast(v, dce);
+ }
case CK_ArrayToPointerDecay:
return cgf.emitArrayToPointerDecay(subExpr).getPointer();
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index db2adc2..7a606ee 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1312,6 +1312,8 @@ public:
mlir::LogicalResult emitDoStmt(const clang::DoStmt &s);
+ mlir::Value emitDynamicCast(Address thisAddr, const CXXDynamicCastExpr *dce);
+
/// Emit an expression as an initializer for an object (variable, field, etc.)
/// at the given location. The expression is not necessarily the normal
/// initializer for the object, and the address is not necessarily
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index 0418174..9e490c6d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -59,7 +59,11 @@ public:
void addImplicitStructorParams(CIRGenFunction &cgf, QualType &resTy,
FunctionArgList &params) override;
-
+ mlir::Value getCXXDestructorImplicitParam(CIRGenFunction &cgf,
+ const CXXDestructorDecl *dd,
+ CXXDtorType type,
+ bool forVirtualBase,
+ bool delegating) override;
void emitCXXConstructors(const clang::CXXConstructorDecl *d) override;
void emitCXXDestructors(const clang::CXXDestructorDecl *d) override;
void emitCXXStructor(clang::GlobalDecl gd) override;
@@ -68,6 +72,8 @@ public:
CXXDtorType type, bool forVirtualBase,
bool delegating, Address thisAddr,
QualType thisTy) override;
+ void registerGlobalDtor(const VarDecl *vd, cir::FuncOp dtor,
+ mlir::Value addr) override;
void emitRethrow(CIRGenFunction &cgf, bool isNoReturn) override;
void emitThrow(CIRGenFunction &cgf, const CXXThrowExpr *e) override;
@@ -116,6 +122,16 @@ public:
Address thisAddr, const CXXRecordDecl *classDecl,
const CXXRecordDecl *baseClassDecl) override;
+ // The traditional clang CodeGen emits calls to `__dynamic_cast` directly into
+ // LLVM in the `emitDynamicCastCall` function. In CIR, `dynamic_cast`
+ // expressions are lowered to `cir.dyn_cast` ops instead of calls to runtime
+ // functions. So during CIRGen we don't need the `emitDynamicCastCall`
+ // function that clang CodeGen has.
+ mlir::Value emitDynamicCast(CIRGenFunction &cgf, mlir::Location loc,
+ QualType srcRecordTy, QualType destRecordTy,
+ cir::PointerType destCIRTy, bool isRefCast,
+ Address src) override;
+
/**************************** RTTI Uniqueness ******************************/
protected:
/// Returns true if the ABI requires RTTI type_info objects to be unique
@@ -1492,11 +1508,8 @@ void CIRGenItaniumCXXABI::emitDestructorCall(
CIRGenFunction &cgf, const CXXDestructorDecl *dd, CXXDtorType type,
bool forVirtualBase, bool delegating, Address thisAddr, QualType thisTy) {
GlobalDecl gd(dd, type);
- if (needsVTTParameter(gd)) {
- cgm.errorNYI(dd->getSourceRange(), "emitDestructorCall: VTT");
- }
-
- mlir::Value vtt = nullptr;
+ mlir::Value vtt =
+ getCXXDestructorImplicitParam(cgf, dd, type, forVirtualBase, delegating);
ASTContext &astContext = cgm.getASTContext();
QualType vttTy = astContext.getPointerType(astContext.VoidPtrTy);
assert(!cir::MissingFeatures::appleKext());
@@ -1507,6 +1520,34 @@ void CIRGenItaniumCXXABI::emitDestructorCall(
vttTy, nullptr);
}
+void CIRGenItaniumCXXABI::registerGlobalDtor(const VarDecl *vd,
+ cir::FuncOp dtor,
+ mlir::Value addr) {
+ if (vd->isNoDestroy(cgm.getASTContext()))
+ return;
+
+ if (vd->getTLSKind()) {
+ cgm.errorNYI(vd->getSourceRange(), "registerGlobalDtor: TLS");
+ return;
+ }
+
+ // HLSL doesn't support atexit.
+ if (cgm.getLangOpts().HLSL) {
+ cgm.errorNYI(vd->getSourceRange(), "registerGlobalDtor: HLSL");
+ return;
+ }
+
+ // The default behavior is to use atexit. This is handled in lowering
+ // prepare. Nothing to be done for CIR here.
+}
+
+mlir::Value CIRGenItaniumCXXABI::getCXXDestructorImplicitParam(
+ CIRGenFunction &cgf, const CXXDestructorDecl *dd, CXXDtorType type,
+ bool forVirtualBase, bool delegating) {
+ GlobalDecl gd(dd, type);
+ return cgf.getVTTParameter(gd, forVirtualBase, delegating);
+}
+
// The idea here is creating a separate block for the throw with an
// `UnreachableOp` as the terminator. So, we branch from the current block
// to the throw block and create a block for the remaining operations.
@@ -1796,3 +1837,143 @@ mlir::Value CIRGenItaniumCXXABI::getVirtualBaseClassOffset(
}
return vbaseOffset;
}
+
+static cir::FuncOp getBadCastFn(CIRGenFunction &cgf) {
+ // Prototype: void __cxa_bad_cast();
+
+ // TODO(cir): set the calling convention of the runtime function.
+ assert(!cir::MissingFeatures::opFuncCallingConv());
+
+ cir::FuncType fnTy =
+ cgf.getBuilder().getFuncType({}, cgf.getBuilder().getVoidTy());
+ return cgf.cgm.createRuntimeFunction(fnTy, "__cxa_bad_cast");
+}
+
+// TODO(cir): This could be shared with classic codegen.
+static CharUnits computeOffsetHint(ASTContext &astContext,
+ const CXXRecordDecl *src,
+ const CXXRecordDecl *dst) {
+ CXXBasePaths paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
+ /*DetectVirtual=*/false);
+
+ // If Dst is not derived from Src we can skip the whole computation below and
+ // return that Src is not a public base of Dst. Record all inheritance paths.
+ if (!dst->isDerivedFrom(src, paths))
+ return CharUnits::fromQuantity(-2ULL);
+
+ unsigned numPublicPaths = 0;
+ CharUnits offset;
+
+ // Now walk all possible inheritance paths.
+ for (const CXXBasePath &path : paths) {
+ if (path.Access != AS_public) // Ignore non-public inheritance.
+ continue;
+
+ ++numPublicPaths;
+
+ for (const CXXBasePathElement &pathElement : path) {
+ // If the path contains a virtual base class we can't give any hint.
+ // -1: no hint.
+ if (pathElement.Base->isVirtual())
+ return CharUnits::fromQuantity(-1ULL);
+
+ if (numPublicPaths > 1) // Won't use offsets, skip computation.
+ continue;
+
+ // Accumulate the base class offsets.
+ const ASTRecordLayout &L =
+ astContext.getASTRecordLayout(pathElement.Class);
+ offset += L.getBaseClassOffset(
+ pathElement.Base->getType()->getAsCXXRecordDecl());
+ }
+ }
+
+ // -2: Src is not a public base of Dst.
+ if (numPublicPaths == 0)
+ return CharUnits::fromQuantity(-2ULL);
+
+ // -3: Src is a multiple public base type but never a virtual base type.
+ if (numPublicPaths > 1)
+ return CharUnits::fromQuantity(-3ULL);
+
+ // Otherwise, the Src type is a unique public nonvirtual base type of Dst.
+ // Return the offset of Src from the origin of Dst.
+ return offset;
+}
+
+static cir::FuncOp getItaniumDynamicCastFn(CIRGenFunction &cgf) {
+ // Prototype:
+ // void *__dynamic_cast(const void *sub,
+ // global_as const abi::__class_type_info *src,
+ // global_as const abi::__class_type_info *dst,
+ // std::ptrdiff_t src2dst_offset);
+
+ mlir::Type voidPtrTy = cgf.getBuilder().getVoidPtrTy();
+ mlir::Type rttiPtrTy = cgf.getBuilder().getUInt8PtrTy();
+ mlir::Type ptrDiffTy = cgf.convertType(cgf.getContext().getPointerDiffType());
+
+ // TODO(cir): mark the function as nowind willreturn readonly.
+ assert(!cir::MissingFeatures::opFuncNoUnwind());
+ assert(!cir::MissingFeatures::opFuncWillReturn());
+ assert(!cir::MissingFeatures::opFuncReadOnly());
+
+ // TODO(cir): set the calling convention of the runtime function.
+ assert(!cir::MissingFeatures::opFuncCallingConv());
+
+ cir::FuncType FTy = cgf.getBuilder().getFuncType(
+ {voidPtrTy, rttiPtrTy, rttiPtrTy, ptrDiffTy}, voidPtrTy);
+ return cgf.cgm.createRuntimeFunction(FTy, "__dynamic_cast");
+}
+
+static cir::DynamicCastInfoAttr emitDynamicCastInfo(CIRGenFunction &cgf,
+ mlir::Location loc,
+ QualType srcRecordTy,
+ QualType destRecordTy) {
+ auto srcRtti = mlir::cast<cir::GlobalViewAttr>(
+ cgf.cgm.getAddrOfRTTIDescriptor(loc, srcRecordTy));
+ auto destRtti = mlir::cast<cir::GlobalViewAttr>(
+ cgf.cgm.getAddrOfRTTIDescriptor(loc, destRecordTy));
+
+ cir::FuncOp runtimeFuncOp = getItaniumDynamicCastFn(cgf);
+ cir::FuncOp badCastFuncOp = getBadCastFn(cgf);
+ auto runtimeFuncRef = mlir::FlatSymbolRefAttr::get(runtimeFuncOp);
+ auto badCastFuncRef = mlir::FlatSymbolRefAttr::get(badCastFuncOp);
+
+ const CXXRecordDecl *srcDecl = srcRecordTy->getAsCXXRecordDecl();
+ const CXXRecordDecl *destDecl = destRecordTy->getAsCXXRecordDecl();
+ CharUnits offsetHint = computeOffsetHint(cgf.getContext(), srcDecl, destDecl);
+
+ mlir::Type ptrdiffTy = cgf.convertType(cgf.getContext().getPointerDiffType());
+ auto offsetHintAttr = cir::IntAttr::get(ptrdiffTy, offsetHint.getQuantity());
+
+ return cir::DynamicCastInfoAttr::get(srcRtti, destRtti, runtimeFuncRef,
+ badCastFuncRef, offsetHintAttr);
+}
+
+mlir::Value CIRGenItaniumCXXABI::emitDynamicCast(CIRGenFunction &cgf,
+ mlir::Location loc,
+ QualType srcRecordTy,
+ QualType destRecordTy,
+ cir::PointerType destCIRTy,
+ bool isRefCast, Address src) {
+ bool isCastToVoid = destRecordTy.isNull();
+ assert((!isCastToVoid || !isRefCast) && "cannot cast to void reference");
+
+ if (isCastToVoid) {
+ cgm.errorNYI(loc, "emitDynamicCastToVoid");
+ return {};
+ }
+
+ // If the destination is effectively final, the cast succeeds if and only
+ // if the dynamic type of the pointer is exactly the destination type.
+ if (destRecordTy->getAsCXXRecordDecl()->isEffectivelyFinal() &&
+ cgf.cgm.getCodeGenOpts().OptimizationLevel > 0) {
+ cgm.errorNYI(loc, "emitExactDynamicCast");
+ return {};
+ }
+
+ cir::DynamicCastInfoAttr castInfo =
+ emitDynamicCastInfo(cgf, loc, srcRecordTy, destRecordTy);
+ return cgf.getBuilder().createDynCast(loc, src.getPointer(), destCIRTy,
+ isRefCast, castInfo);
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 910c8a9..fe1ea56 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -2079,6 +2079,29 @@ CIRGenModule::createCIRBuiltinFunction(mlir::Location loc, StringRef name,
return fnOp;
}
+cir::FuncOp CIRGenModule::createRuntimeFunction(cir::FuncType ty,
+ StringRef name, mlir::ArrayAttr,
+ [[maybe_unused]] bool isLocal,
+ bool assumeConvergent) {
+ if (assumeConvergent)
+ errorNYI("createRuntimeFunction: assumeConvergent");
+ if (isLocal)
+ errorNYI("createRuntimeFunction: local");
+
+ cir::FuncOp entry = getOrCreateCIRFunction(name, ty, GlobalDecl(),
+ /*forVtable=*/false);
+
+ if (entry) {
+ // TODO(cir): set the attributes of the function.
+ assert(!cir::MissingFeatures::setLLVMFunctionFEnvAttributes());
+ assert(!cir::MissingFeatures::opFuncCallingConv());
+ assert(!cir::MissingFeatures::opGlobalDLLImportExport());
+ entry.setDSOLocal(true);
+ }
+
+ return entry;
+}
+
mlir::SymbolTable::Visibility
CIRGenModule::getMLIRVisibility(cir::GlobalOp op) {
// MLIR doesn't accept public symbols declarations (only
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index c6a6681..f627bae 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -480,6 +480,10 @@ public:
cir::FuncType ty,
const clang::FunctionDecl *fd);
+ cir::FuncOp createRuntimeFunction(cir::FuncType ty, llvm::StringRef name,
+ mlir::ArrayAttr = {}, bool isLocal = false,
+ bool assumeConvergent = false);
+
static constexpr const char *builtinCoroId = "__builtin_coro_id";
/// Given a builtin id for a function like "__builtin_fabsf", return a
diff --git a/clang/lib/CIR/CodeGen/CIRGenVTables.cpp b/clang/lib/CIR/CodeGen/CIRGenVTables.cpp
index 94d856b..84f5977 100644
--- a/clang/lib/CIR/CodeGen/CIRGenVTables.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenVTables.cpp
@@ -327,9 +327,40 @@ cir::GlobalLinkageKind CIRGenModule::getVTableLinkage(const CXXRecordDecl *rd) {
llvm_unreachable("Should not have been asked to emit this");
}
}
+ // -fapple-kext mode does not support weak linkage, so we must use
+ // internal linkage.
+ if (astContext.getLangOpts().AppleKext)
+ return cir::GlobalLinkageKind::InternalLinkage;
+
+ auto discardableODRLinkage = cir::GlobalLinkageKind::LinkOnceODRLinkage;
+ auto nonDiscardableODRLinkage = cir::GlobalLinkageKind::WeakODRLinkage;
+ if (rd->hasAttr<DLLExportAttr>()) {
+ // Cannot discard exported vtables.
+ discardableODRLinkage = nonDiscardableODRLinkage;
+ } else if (rd->hasAttr<DLLImportAttr>()) {
+ // Imported vtables are available externally.
+ discardableODRLinkage = cir::GlobalLinkageKind::AvailableExternallyLinkage;
+ nonDiscardableODRLinkage =
+ cir::GlobalLinkageKind::AvailableExternallyLinkage;
+ }
+
+ switch (rd->getTemplateSpecializationKind()) {
+ case TSK_Undeclared:
+ case TSK_ExplicitSpecialization:
+ case TSK_ImplicitInstantiation:
+ return discardableODRLinkage;
+
+ case TSK_ExplicitInstantiationDeclaration: {
+ errorNYI(rd->getSourceRange(),
+ "getVTableLinkage: explicit instantiation declaration");
+ return cir::GlobalLinkageKind::ExternalLinkage;
+ }
+
+ case TSK_ExplicitInstantiationDefinition:
+ return nonDiscardableODRLinkage;
+ }
- errorNYI(rd->getSourceRange(), "getVTableLinkage: no key function");
- return cir::GlobalLinkageKind::ExternalLinkage;
+ llvm_unreachable("Invalid TemplateSpecializationKind!");
}
cir::GlobalOp CIRGenVTables::getAddrOfVTT(const CXXRecordDecl *rd) {
diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h
index 66c1f76..67a72f5 100644
--- a/clang/lib/CIR/CodeGen/EHScopeStack.h
+++ b/clang/lib/CIR/CodeGen/EHScopeStack.h
@@ -108,9 +108,6 @@ public:
///
// \param flags cleanup kind.
virtual void emit(CIRGenFunction &cgf) = 0;
-
- // This is a placeholder until EHScope is implemented.
- virtual size_t getSize() const = 0;
};
private:
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index bc917d0..706e54f 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -41,6 +41,16 @@ static SmallString<128> getTransformedFileName(mlir::ModuleOp mlirModule) {
return fileName;
}
+/// Return the FuncOp called by `callOp`.
+static cir::FuncOp getCalledFunction(cir::CallOp callOp) {
+ mlir::SymbolRefAttr sym = llvm::dyn_cast_if_present<mlir::SymbolRefAttr>(
+ callOp.getCallableForCallee());
+ if (!sym)
+ return nullptr;
+ return dyn_cast_or_null<cir::FuncOp>(
+ mlir::SymbolTable::lookupNearestSymbolFrom(callOp, sym));
+}
+
namespace {
struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
LoweringPreparePass() = default;
@@ -69,6 +79,12 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
cir::FuncType type,
cir::GlobalLinkageKind linkage = cir::GlobalLinkageKind::ExternalLinkage);
+ cir::GlobalOp buildRuntimeVariable(
+ mlir::OpBuilder &builder, llvm::StringRef name, mlir::Location loc,
+ mlir::Type type,
+ cir::GlobalLinkageKind linkage = cir::GlobalLinkageKind::ExternalLinkage,
+ cir::VisibilityKind visibility = cir::VisibilityKind::Default);
+
///
/// AST related
/// -----------
@@ -90,6 +106,25 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
} // namespace
+cir::GlobalOp LoweringPreparePass::buildRuntimeVariable(
+ mlir::OpBuilder &builder, llvm::StringRef name, mlir::Location loc,
+ mlir::Type type, cir::GlobalLinkageKind linkage,
+ cir::VisibilityKind visibility) {
+ cir::GlobalOp g = dyn_cast_or_null<cir::GlobalOp>(
+ mlir::SymbolTable::lookupNearestSymbolFrom(
+ mlirModule, mlir::StringAttr::get(mlirModule->getContext(), name)));
+ if (!g) {
+ g = cir::GlobalOp::create(builder, loc, name, type);
+ g.setLinkageAttr(
+ cir::GlobalLinkageKindAttr::get(builder.getContext(), linkage));
+ mlir::SymbolTable::setSymbolVisibility(
+ g, mlir::SymbolTable::Visibility::Private);
+ g.setGlobalVisibilityAttr(
+ cir::VisibilityAttr::get(builder.getContext(), visibility));
+ }
+ return g;
+}
+
cir::FuncOp LoweringPreparePass::buildRuntimeFunction(
mlir::OpBuilder &builder, llvm::StringRef name, mlir::Location loc,
cir::FuncType type, cir::GlobalLinkageKind linkage) {
@@ -640,7 +675,8 @@ LoweringPreparePass::buildCXXGlobalVarDeclInitFunc(cir::GlobalOp op) {
// Create a variable initialization function.
CIRBaseBuilderTy builder(getContext());
builder.setInsertionPointAfter(op);
- auto fnType = cir::FuncType::get({}, builder.getVoidTy());
+ cir::VoidType voidTy = builder.getVoidTy();
+ auto fnType = cir::FuncType::get({}, voidTy);
FuncOp f = buildRuntimeFunction(builder, fnName, op.getLoc(), fnType,
cir::GlobalLinkageKind::InternalLinkage);
@@ -655,8 +691,57 @@ LoweringPreparePass::buildCXXGlobalVarDeclInitFunc(cir::GlobalOp op) {
// Register the destructor call with __cxa_atexit
mlir::Region &dtorRegion = op.getDtorRegion();
if (!dtorRegion.empty()) {
- assert(!cir::MissingFeatures::opGlobalDtorLowering());
- llvm_unreachable("dtor region lowering is NYI");
+ assert(!cir::MissingFeatures::astVarDeclInterface());
+ assert(!cir::MissingFeatures::opGlobalThreadLocal());
+ // Create a variable that binds the atexit to this shared object.
+ builder.setInsertionPointToStart(&mlirModule.getBodyRegion().front());
+ cir::GlobalOp handle = buildRuntimeVariable(
+ builder, "__dso_handle", op.getLoc(), builder.getI8Type(),
+ cir::GlobalLinkageKind::ExternalLinkage, cir::VisibilityKind::Hidden);
+
+ // Look for the destructor call in dtorBlock
+ mlir::Block &dtorBlock = dtorRegion.front();
+ cir::CallOp dtorCall;
+ for (auto op : reverse(dtorBlock.getOps<cir::CallOp>())) {
+ dtorCall = op;
+ break;
+ }
+ assert(dtorCall && "Expected a dtor call");
+ cir::FuncOp dtorFunc = getCalledFunction(dtorCall);
+ assert(dtorFunc && "Expected a dtor call");
+
+ // Create a runtime helper function:
+ // extern "C" int __cxa_atexit(void (*f)(void *), void *p, void *d);
+ auto voidPtrTy = cir::PointerType::get(voidTy);
+ auto voidFnTy = cir::FuncType::get({voidPtrTy}, voidTy);
+ auto voidFnPtrTy = cir::PointerType::get(voidFnTy);
+ auto handlePtrTy = cir::PointerType::get(handle.getSymType());
+ auto fnAtExitType =
+ cir::FuncType::get({voidFnPtrTy, voidPtrTy, handlePtrTy}, voidTy);
+ const char *nameAtExit = "__cxa_atexit";
+ cir::FuncOp fnAtExit =
+ buildRuntimeFunction(builder, nameAtExit, op.getLoc(), fnAtExitType);
+
+ // Replace the dtor call with a call to __cxa_atexit(&dtor, &var,
+ // &__dso_handle)
+ builder.setInsertionPointAfter(dtorCall);
+ mlir::Value args[3];
+ auto dtorPtrTy = cir::PointerType::get(dtorFunc.getFunctionType());
+ // dtorPtrTy
+ args[0] = cir::GetGlobalOp::create(builder, dtorCall.getLoc(), dtorPtrTy,
+ dtorFunc.getSymName());
+ args[0] = cir::CastOp::create(builder, dtorCall.getLoc(), voidFnPtrTy,
+ cir::CastKind::bitcast, args[0]);
+ args[1] =
+ cir::CastOp::create(builder, dtorCall.getLoc(), voidPtrTy,
+ cir::CastKind::bitcast, dtorCall.getArgOperand(0));
+ args[2] = cir::GetGlobalOp::create(builder, handle.getLoc(), handlePtrTy,
+ handle.getSymName());
+ builder.createCallOp(dtorCall.getLoc(), fnAtExit, args);
+ dtorCall->erase();
+ entryBB->getOperations().splice(entryBB->end(), dtorBlock.getOperations(),
+ dtorBlock.begin(),
+ std::prev(dtorBlock.end()));
}
// Replace cir.yield with cir.return
@@ -666,11 +751,12 @@ LoweringPreparePass::buildCXXGlobalVarDeclInitFunc(cir::GlobalOp op) {
mlir::Block &block = op.getCtorRegion().front();
yieldOp = &block.getOperations().back();
} else {
- assert(!cir::MissingFeatures::opGlobalDtorLowering());
- llvm_unreachable("dtor region lowering is NYI");
+ assert(!dtorRegion.empty());
+ mlir::Block &block = dtorRegion.front();
+ yieldOp = &block.getOperations().back();
}
- assert(isa<YieldOp>(*yieldOp));
+ assert(isa<cir::YieldOp>(*yieldOp));
cir::ReturnOp::create(builder, yieldOp->getLoc());
return f;
}
@@ -715,7 +801,10 @@ void LoweringPreparePass::buildGlobalCtorDtorList() {
mlir::ArrayAttr::get(&getContext(), globalCtors));
}
- assert(!cir::MissingFeatures::opGlobalDtorLowering());
+ // We will eventually need to populate a global_dtor list, but that's not
+ // needed for globals with destructors. It will only be needed for functions
+ // that are marked as global destructors with an attribute.
+ assert(!cir::MissingFeatures::opGlobalDtorList());
}
void LoweringPreparePass::buildCXXGlobalInitFunc() {
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index a80a295..a1ecfc7 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1771,9 +1771,13 @@ mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite(
}
// Rewrite op.
- rewriter.replaceOpWithNewOp<mlir::LLVM::GlobalOp>(
+ auto newOp = rewriter.replaceOpWithNewOp<mlir::LLVM::GlobalOp>(
op, llvmType, isConst, linkage, symbol, init.value_or(mlir::Attribute()),
alignment, addrSpace, isDsoLocal, isThreadLocal, comdatAttr, attributes);
+ newOp.setVisibility_Attr(mlir::LLVM::VisibilityAttr::get(
+ getContext(), lowerCIRVisibilityToLLVMVisibility(
+ op.getGlobalVisibilityAttr().getValue())));
+
return mlir::success();
}
@@ -2594,6 +2598,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
return std::make_pair(ctorAttr.getName(),
ctorAttr.getPriority());
});
+ assert(!cir::MissingFeatures::opGlobalDtorList());
}
mlir::LogicalResult CIRToLLVMBrOpLowering::matchAndRewrite(
diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index 6c0fc8d..4f2f5a76 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -352,6 +352,19 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
SmallVector<Value *> Args{OrderID, SpaceOp, RangeOp, IndexOp, Name};
return Builder.CreateIntrinsic(HandleTy, IntrinsicID, Args);
}
+ case Builtin::BI__builtin_hlsl_resource_counterhandlefromimplicitbinding: {
+ Value *MainHandle = EmitScalarExpr(E->getArg(0));
+ if (!CGM.getTriple().isSPIRV())
+ return MainHandle;
+
+ llvm::Type *HandleTy = CGM.getTypes().ConvertType(E->getType());
+ Value *OrderID = EmitScalarExpr(E->getArg(1));
+ Value *SpaceOp = EmitScalarExpr(E->getArg(2));
+ llvm::Intrinsic::ID IntrinsicID =
+ llvm::Intrinsic::spv_resource_counterhandlefromimplicitbinding;
+ SmallVector<Value *> Args{MainHandle, OrderID, SpaceOp};
+ return Builder.CreateIntrinsic(HandleTy, IntrinsicID, Args);
+ }
case Builtin::BI__builtin_hlsl_resource_nonuniformindex: {
Value *IndexOp = EmitScalarExpr(E->getArg(0));
llvm::Type *RetTy = ConvertType(E->getType());
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index ede1780..603cef9 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -145,19 +145,29 @@ static CXXMethodDecl *lookupResourceInitMethodAndSetupArgs(
// explicit binding
auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, Binding.getSlot());
Args.add(RValue::get(RegSlot), AST.UnsignedIntTy);
- CreateMethod = lookupMethod(ResourceDecl, "__createFromBinding", SC_Static);
+ const char *Name = Binding.hasCounterImplicitOrderID()
+ ? "__createFromBindingWithImplicitCounter"
+ : "__createFromBinding";
+ CreateMethod = lookupMethod(ResourceDecl, Name, SC_Static);
} else {
// implicit binding
auto *OrderID =
llvm::ConstantInt::get(CGM.IntTy, Binding.getImplicitOrderID());
Args.add(RValue::get(OrderID), AST.UnsignedIntTy);
- CreateMethod =
- lookupMethod(ResourceDecl, "__createFromImplicitBinding", SC_Static);
+ const char *Name = Binding.hasCounterImplicitOrderID()
+ ? "__createFromImplicitBindingWithImplicitCounter"
+ : "__createFromImplicitBinding";
+ CreateMethod = lookupMethod(ResourceDecl, Name, SC_Static);
}
Args.add(RValue::get(Space), AST.UnsignedIntTy);
Args.add(RValue::get(Range), AST.IntTy);
Args.add(RValue::get(Index), AST.UnsignedIntTy);
Args.add(RValue::get(NameStr), AST.getPointerType(AST.CharTy.withConst()));
+ if (Binding.hasCounterImplicitOrderID()) {
+ uint32_t CounterBinding = Binding.getCounterImplicitOrderID();
+ auto *CounterOrderID = llvm::ConstantInt::get(CGM.IntTy, CounterBinding);
+ Args.add(RValue::get(CounterOrderID), AST.UnsignedIntTy);
+ }
return CreateMethod;
}
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index a2c6957..90191b0 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -3200,6 +3200,8 @@ ExprResult Parser::ParseRequiresExpression() {
BalancedDelimiterTracker ExprBraces(*this, tok::l_brace);
ExprBraces.consumeOpen();
ExprResult Expression = ParseExpression();
+ if (Expression.isUsable())
+ Expression = Actions.CheckPlaceholderExpr(Expression.get());
if (!Expression.isUsable()) {
ExprBraces.skipToEnd();
SkipUntil(tok::semi, tok::r_brace, SkipUntilFlags::StopBeforeMatch);
@@ -3369,6 +3371,8 @@ ExprResult Parser::ParseRequiresExpression() {
// expression ';'
SourceLocation StartLoc = Tok.getLocation();
ExprResult Expression = ParseExpression();
+ if (Expression.isUsable())
+ Expression = Actions.CheckPlaceholderExpr(Expression.get());
if (!Expression.isUsable()) {
SkipUntil(tok::semi, tok::r_brace, SkipUntilFlags::StopBeforeMatch);
break;
diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp
index 3c20ccd..40c318a 100644
--- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp
+++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp
@@ -144,6 +144,7 @@ private:
_2,
_3,
_4,
+ _5,
Handle = 128,
CounterHandle,
LastStmt
@@ -190,6 +191,9 @@ public:
template <typename T>
BuiltinTypeMethodBuilder &
accessCounterHandleFieldOnResource(T ResourceRecord);
+ template <typename ResourceT, typename ValueT>
+ BuiltinTypeMethodBuilder &
+ setCounterHandleFieldOnResource(ResourceT ResourceRecord, ValueT HandleValue);
template <typename T> BuiltinTypeMethodBuilder &returnValue(T ReturnValue);
BuiltinTypeMethodBuilder &returnThis();
BuiltinTypeDeclBuilder &finalize();
@@ -205,6 +209,11 @@ private:
if (!Method)
createDecl();
}
+
+ template <typename ResourceT, typename ValueT>
+ BuiltinTypeMethodBuilder &setFieldOnResource(ResourceT ResourceRecord,
+ ValueT HandleValue,
+ FieldDecl *HandleField);
};
TemplateParameterListBuilder::~TemplateParameterListBuilder() {
@@ -592,13 +601,27 @@ template <typename ResourceT, typename ValueT>
BuiltinTypeMethodBuilder &
BuiltinTypeMethodBuilder::setHandleFieldOnResource(ResourceT ResourceRecord,
ValueT HandleValue) {
+ return setFieldOnResource(ResourceRecord, HandleValue,
+ DeclBuilder.getResourceHandleField());
+}
+
+template <typename ResourceT, typename ValueT>
+BuiltinTypeMethodBuilder &
+BuiltinTypeMethodBuilder::setCounterHandleFieldOnResource(
+ ResourceT ResourceRecord, ValueT HandleValue) {
+ return setFieldOnResource(ResourceRecord, HandleValue,
+ DeclBuilder.getResourceCounterHandleField());
+}
+
+template <typename ResourceT, typename ValueT>
+BuiltinTypeMethodBuilder &BuiltinTypeMethodBuilder::setFieldOnResource(
+ ResourceT ResourceRecord, ValueT HandleValue, FieldDecl *HandleField) {
ensureCompleteDecl();
Expr *ResourceExpr = convertPlaceholder(ResourceRecord);
Expr *HandleValueExpr = convertPlaceholder(HandleValue);
ASTContext &AST = DeclBuilder.SemaRef.getASTContext();
- FieldDecl *HandleField = DeclBuilder.getResourceHandleField();
MemberExpr *HandleMemberExpr = MemberExpr::CreateImplicit(
AST, ResourceExpr, false, HandleField, HandleField->getType(), VK_LValue,
OK_Ordinary);
@@ -829,6 +852,18 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addDefaultHandleConstructor() {
.finalize();
}
+BuiltinTypeDeclBuilder &
+BuiltinTypeDeclBuilder::addStaticInitializationFunctions(bool HasCounter) {
+ if (HasCounter) {
+ addCreateFromBindingWithImplicitCounter();
+ addCreateFromImplicitBindingWithImplicitCounter();
+ } else {
+ addCreateFromBinding();
+ addCreateFromImplicitBinding();
+ }
+ return *this;
+}
+
// Adds static method that initializes resource from binding:
//
// static Resource<T> __createFromBinding(unsigned registerNo,
@@ -903,6 +938,102 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCreateFromImplicitBinding() {
.finalize();
}
+// Adds static method that initializes resource from binding:
+//
+// static Resource<T>
+// __createFromBindingWithImplicitCounter(unsigned registerNo,
+// unsigned spaceNo, int range,
+// unsigned index, const char *name,
+// unsigned counterOrderId) {
+// Resource<T> tmp;
+// tmp.__handle = __builtin_hlsl_resource_handlefrombinding(
+// tmp.__handle, registerNo, spaceNo, range, index, name);
+// tmp.__counter_handle =
+// __builtin_hlsl_resource_counterhandlefromimplicitbinding(
+// tmp.__handle, counterOrderId, spaceNo);
+// return tmp;
+// }
+BuiltinTypeDeclBuilder &
+BuiltinTypeDeclBuilder::addCreateFromBindingWithImplicitCounter() {
+ assert(!Record->isCompleteDefinition() && "record is already complete");
+
+ using PH = BuiltinTypeMethodBuilder::PlaceHolder;
+ ASTContext &AST = SemaRef.getASTContext();
+ QualType HandleType = getResourceHandleField()->getType();
+ QualType RecordType = AST.getTypeDeclType(cast<TypeDecl>(Record));
+ BuiltinTypeMethodBuilder::LocalVar TmpVar("tmp", RecordType);
+
+ return BuiltinTypeMethodBuilder(*this,
+ "__createFromBindingWithImplicitCounter",
+ RecordType, false, false, SC_Static)
+ .addParam("registerNo", AST.UnsignedIntTy)
+ .addParam("spaceNo", AST.UnsignedIntTy)
+ .addParam("range", AST.IntTy)
+ .addParam("index", AST.UnsignedIntTy)
+ .addParam("name", AST.getPointerType(AST.CharTy.withConst()))
+ .addParam("counterOrderId", AST.UnsignedIntTy)
+ .declareLocalVar(TmpVar)
+ .accessHandleFieldOnResource(TmpVar)
+ .callBuiltin("__builtin_hlsl_resource_handlefrombinding", HandleType,
+ PH::LastStmt, PH::_0, PH::_1, PH::_2, PH::_3, PH::_4)
+ .setHandleFieldOnResource(TmpVar, PH::LastStmt)
+ .accessHandleFieldOnResource(TmpVar)
+ .callBuiltin("__builtin_hlsl_resource_counterhandlefromimplicitbinding",
+ HandleType, PH::LastStmt, PH::_5, PH::_1)
+ .setCounterHandleFieldOnResource(TmpVar, PH::LastStmt)
+ .returnValue(TmpVar)
+ .finalize();
+}
+
+// Adds static method that initializes resource from implicit binding:
+//
+// static Resource<T>
+// __createFromImplicitBindingWithImplicitCounter(unsigned orderId,
+// unsigned spaceNo, int range,
+// unsigned index,
+// const char *name,
+// unsigned counterOrderId) {
+// Resource<T> tmp;
+// tmp.__handle = __builtin_hlsl_resource_handlefromimplicitbinding(
+// tmp.__handle, orderId, spaceNo, range, index, name);
+// tmp.__counter_handle =
+// __builtin_hlsl_resource_counterhandlefromimplicitbinding(
+// tmp.__handle, counterOrderId, spaceNo);
+// return tmp;
+// }
+BuiltinTypeDeclBuilder &
+BuiltinTypeDeclBuilder::addCreateFromImplicitBindingWithImplicitCounter() {
+ assert(!Record->isCompleteDefinition() && "record is already complete");
+
+ using PH = BuiltinTypeMethodBuilder::PlaceHolder;
+ ASTContext &AST = SemaRef.getASTContext();
+ QualType HandleType = getResourceHandleField()->getType();
+ QualType RecordType = AST.getTypeDeclType(cast<TypeDecl>(Record));
+ BuiltinTypeMethodBuilder::LocalVar TmpVar("tmp", RecordType);
+
+ return BuiltinTypeMethodBuilder(
+ *this, "__createFromImplicitBindingWithImplicitCounter",
+ RecordType, false, false, SC_Static)
+ .addParam("orderId", AST.UnsignedIntTy)
+ .addParam("spaceNo", AST.UnsignedIntTy)
+ .addParam("range", AST.IntTy)
+ .addParam("index", AST.UnsignedIntTy)
+ .addParam("name", AST.getPointerType(AST.CharTy.withConst()))
+ .addParam("counterOrderId", AST.UnsignedIntTy)
+ .declareLocalVar(TmpVar)
+ .accessHandleFieldOnResource(TmpVar)
+ .callBuiltin("__builtin_hlsl_resource_handlefromimplicitbinding",
+ HandleType, PH::LastStmt, PH::_0, PH::_1, PH::_2, PH::_3,
+ PH::_4)
+ .setHandleFieldOnResource(TmpVar, PH::LastStmt)
+ .accessHandleFieldOnResource(TmpVar)
+ .callBuiltin("__builtin_hlsl_resource_counterhandlefromimplicitbinding",
+ HandleType, PH::LastStmt, PH::_5, PH::_1)
+ .setCounterHandleFieldOnResource(TmpVar, PH::LastStmt)
+ .returnValue(TmpVar)
+ .finalize();
+}
+
BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCopyConstructor() {
assert(!Record->isCompleteDefinition() && "record is already complete");
@@ -1048,7 +1179,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addIncrementCounterMethod() {
return BuiltinTypeMethodBuilder(*this, "IncrementCounter",
SemaRef.getASTContext().UnsignedIntTy)
.callBuiltin("__builtin_hlsl_buffer_update_counter", QualType(),
- PH::Handle, getConstantIntExpr(1))
+ PH::CounterHandle, getConstantIntExpr(1))
.finalize();
}
@@ -1057,7 +1188,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addDecrementCounterMethod() {
return BuiltinTypeMethodBuilder(*this, "DecrementCounter",
SemaRef.getASTContext().UnsignedIntTy)
.callBuiltin("__builtin_hlsl_buffer_update_counter", QualType(),
- PH::Handle, getConstantIntExpr(-1))
+ PH::CounterHandle, getConstantIntExpr(-1))
.finalize();
}
@@ -1102,7 +1233,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addAppendMethod() {
return BuiltinTypeMethodBuilder(*this, "Append", AST.VoidTy)
.addParam("value", ElemTy)
.callBuiltin("__builtin_hlsl_buffer_update_counter", AST.UnsignedIntTy,
- PH::Handle, getConstantIntExpr(1))
+ PH::CounterHandle, getConstantIntExpr(1))
.callBuiltin("__builtin_hlsl_resource_getpointer",
AST.getPointerType(AddrSpaceElemTy), PH::Handle,
PH::LastStmt)
@@ -1119,7 +1250,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addConsumeMethod() {
AST.getAddrSpaceQualType(ElemTy, LangAS::hlsl_device);
return BuiltinTypeMethodBuilder(*this, "Consume", ElemTy)
.callBuiltin("__builtin_hlsl_buffer_update_counter", AST.UnsignedIntTy,
- PH::Handle, getConstantIntExpr(-1))
+ PH::CounterHandle, getConstantIntExpr(-1))
.callBuiltin("__builtin_hlsl_resource_getpointer",
AST.getPointerType(AddrSpaceElemTy), PH::Handle,
PH::LastStmt)
diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h
index a981602..86cbd10 100644
--- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h
+++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.h
@@ -83,8 +83,7 @@ public:
BuiltinTypeDeclBuilder &addCopyAssignmentOperator();
// Static create methods
- BuiltinTypeDeclBuilder &addCreateFromBinding();
- BuiltinTypeDeclBuilder &addCreateFromImplicitBinding();
+ BuiltinTypeDeclBuilder &addStaticInitializationFunctions(bool HasCounter);
// Builtin types methods
BuiltinTypeDeclBuilder &addLoadMethods();
@@ -96,6 +95,10 @@ public:
BuiltinTypeDeclBuilder &addConsumeMethod();
private:
+ BuiltinTypeDeclBuilder &addCreateFromBinding();
+ BuiltinTypeDeclBuilder &addCreateFromImplicitBinding();
+ BuiltinTypeDeclBuilder &addCreateFromBindingWithImplicitCounter();
+ BuiltinTypeDeclBuilder &addCreateFromImplicitBindingWithImplicitCounter();
BuiltinTypeDeclBuilder &addResourceMember(StringRef MemberName,
ResourceClass RC, bool IsROV,
bool RawBuffer, bool IsCounter,
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index cc43e94..e118dda 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -236,8 +236,7 @@ static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S,
.addDefaultHandleConstructor()
.addCopyConstructor()
.addCopyAssignmentOperator()
- .addCreateFromBinding()
- .addCreateFromImplicitBinding();
+ .addStaticInitializationFunctions(HasCounter);
}
// This function is responsible for constructing the constraint expression for
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 09e5d69..17cb1e4 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -1240,6 +1240,20 @@ static CXXMethodDecl *lookupMethod(Sema &S, CXXRecordDecl *RecordDecl,
} // end anonymous namespace
+static bool hasCounterHandle(const CXXRecordDecl *RD) {
+ if (RD->field_empty())
+ return false;
+ auto It = std::next(RD->field_begin());
+ if (It == RD->field_end())
+ return false;
+ const FieldDecl *SecondField = *It;
+ if (const auto *ResTy =
+ SecondField->getType()->getAs<HLSLAttributedResourceType>()) {
+ return ResTy->getAttrs().IsCounter;
+ }
+ return false;
+}
+
bool SemaHLSL::handleRootSignatureElements(
ArrayRef<hlsl::RootSignatureElement> Elements) {
// Define some common error handling functions
@@ -2973,6 +2987,25 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
TheCall->setType(ResourceTy);
break;
}
+ case Builtin::BI__builtin_hlsl_resource_counterhandlefromimplicitbinding: {
+ ASTContext &AST = SemaRef.getASTContext();
+ if (SemaRef.checkArgCount(TheCall, 3) ||
+ CheckResourceHandle(&SemaRef, TheCall, 0) ||
+ CheckArgTypeMatches(&SemaRef, TheCall->getArg(1), AST.UnsignedIntTy) ||
+ CheckArgTypeMatches(&SemaRef, TheCall->getArg(2), AST.UnsignedIntTy))
+ return true;
+
+ QualType MainHandleTy = TheCall->getArg(0)->getType();
+ auto *MainResType = MainHandleTy->getAs<HLSLAttributedResourceType>();
+ auto MainAttrs = MainResType->getAttrs();
+ assert(!MainAttrs.IsCounter && "cannot create a counter from a counter");
+ MainAttrs.IsCounter = true;
+ QualType CounterHandleTy = AST.getHLSLAttributedResourceType(
+ MainResType->getWrappedType(), MainResType->getContainedType(),
+ MainAttrs);
+ TheCall->setType(CounterHandleTy);
+ break;
+ }
case Builtin::BI__builtin_hlsl_and:
case Builtin::BI__builtin_hlsl_or: {
if (SemaRef.checkArgCount(TheCall, 2))
@@ -3780,10 +3813,24 @@ void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) {
uint32_t OrderID = getNextImplicitBindingOrderID();
if (Binding.hasBinding())
Binding.setImplicitOrderID(OrderID);
- else
+ else {
addImplicitBindingAttrToDecl(
SemaRef, VD, getRegisterType(getResourceArrayHandleType(VD)),
OrderID);
+ // Re-create the binding object to pick up the new attribute.
+ Binding = ResourceBindingAttrs(VD);
+ }
+ }
+
+ // Get to the base type of a potentially multi-dimensional array.
+ QualType Ty = getASTContext().getBaseElementType(VD->getType());
+
+ const CXXRecordDecl *RD = Ty->getAsCXXRecordDecl();
+ if (hasCounterHandle(RD)) {
+ if (!Binding.hasCounterImplicitOrderID()) {
+ uint32_t OrderID = getNextImplicitBindingOrderID();
+ Binding.setCounterImplicitOrderID(OrderID);
+ }
}
}
}
@@ -3808,19 +3855,31 @@ bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) {
CXXMethodDecl *CreateMethod = nullptr;
llvm::SmallVector<Expr *> Args;
+ bool HasCounter = hasCounterHandle(ResourceDecl);
+ const char *CreateMethodName;
+ if (Binding.isExplicit())
+ CreateMethodName = HasCounter ? "__createFromBindingWithImplicitCounter"
+ : "__createFromBinding";
+ else
+ CreateMethodName = HasCounter
+ ? "__createFromImplicitBindingWithImplicitCounter"
+ : "__createFromImplicitBinding";
+
+ CreateMethod =
+ lookupMethod(SemaRef, ResourceDecl, CreateMethodName, VD->getLocation());
+
+ if (!CreateMethod)
+ // This can happen if someone creates a struct that looks like an HLSL
+ // resource record but does not have the required static create method.
+ // No binding will be generated for it.
+ return false;
+
if (Binding.isExplicit()) {
- // The resource has explicit binding.
- CreateMethod = lookupMethod(SemaRef, ResourceDecl, "__createFromBinding",
- VD->getLocation());
IntegerLiteral *RegSlot =
IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, Binding.getSlot()),
AST.UnsignedIntTy, SourceLocation());
Args.push_back(RegSlot);
} else {
- // The resource has implicit binding.
- CreateMethod =
- lookupMethod(SemaRef, ResourceDecl, "__createFromImplicitBinding",
- VD->getLocation());
uint32_t OrderID = (Binding.hasImplicitOrderID())
? Binding.getImplicitOrderID()
: getNextImplicitBindingOrderID();
@@ -3830,12 +3889,6 @@ bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) {
Args.push_back(OrderId);
}
- if (!CreateMethod)
- // This can happen if someone creates a struct that looks like an HLSL
- // resource record but does not have the required static create method.
- // No binding will be generated for it.
- return false;
-
IntegerLiteral *Space =
IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, Binding.getSpace()),
AST.UnsignedIntTy, SourceLocation());
@@ -3859,6 +3912,15 @@ bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) {
Name, nullptr, VK_PRValue, FPOptionsOverride());
Args.push_back(NameCast);
+ if (HasCounter) {
+ // TODO: Verify the counter order ID is assigned in the correct order.
+ uint32_t CounterOrderID = getNextImplicitBindingOrderID();
+ IntegerLiteral *CounterId =
+ IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, CounterOrderID),
+ AST.UnsignedIntTy, SourceLocation());
+ Args.push_back(CounterId);
+ }
+
// Make sure the create method template is instantiated and emitted.
if (!CreateMethod->isDefined() && CreateMethod->isTemplateInstantiation())
SemaRef.InstantiateFunctionDefinition(VD->getLocation(), CreateMethod,
@@ -3899,20 +3961,24 @@ bool SemaHLSL::initGlobalResourceArrayDecl(VarDecl *VD) {
ASTContext &AST = SemaRef.getASTContext();
QualType ResElementTy = AST.getBaseElementType(VD->getType());
CXXRecordDecl *ResourceDecl = ResElementTy->getAsCXXRecordDecl();
-
- HLSLResourceBindingAttr *RBA = VD->getAttr<HLSLResourceBindingAttr>();
- HLSLVkBindingAttr *VkBinding = VD->getAttr<HLSLVkBindingAttr>();
CXXMethodDecl *CreateMethod = nullptr;
- if (VkBinding || (RBA && RBA->hasRegisterSlot()))
+ bool HasCounter = hasCounterHandle(ResourceDecl);
+ ResourceBindingAttrs ResourceAttrs(VD);
+ if (ResourceAttrs.isExplicit())
// Resource has explicit binding.
- CreateMethod = lookupMethod(SemaRef, ResourceDecl, "__createFromBinding",
- VD->getLocation());
- else
- // Resource has implicit binding.
CreateMethod =
- lookupMethod(SemaRef, ResourceDecl, "__createFromImplicitBinding",
+ lookupMethod(SemaRef, ResourceDecl,
+ HasCounter ? "__createFromBindingWithImplicitCounter"
+ : "__createFromBinding",
VD->getLocation());
+ else
+ // Resource has implicit binding.
+ CreateMethod = lookupMethod(
+ SemaRef, ResourceDecl,
+ HasCounter ? "__createFromImplicitBindingWithImplicitCounter"
+ : "__createFromImplicitBinding",
+ VD->getLocation());
if (!CreateMethod)
return false;
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 8471f02..4824b5a 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -2946,5 +2946,5 @@ OpenACCReductionRecipe SemaOpenACC::CreateReductionInitRecipe(
AllocaDecl->setInit(Init.get());
AllocaDecl->setInitStyle(VarDecl::CallInit);
}
- return OpenACCReductionRecipe(AllocaDecl);
+ return OpenACCReductionRecipe(AllocaDecl, {});
}
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 6acf79a..868f0cc 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -13009,9 +13009,22 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() {
llvm::SmallVector<OpenACCReductionRecipe> RecipeList;
for (unsigned I = 0; I < VarList.size(); ++I) {
- static_assert(sizeof(OpenACCReductionRecipe) == sizeof(int *));
VarDecl *Recipe = readDeclAs<VarDecl>();
- RecipeList.push_back({Recipe});
+
+ static_assert(sizeof(OpenACCReductionRecipe::CombinerRecipe) ==
+ 3 * sizeof(int *));
+
+ llvm::SmallVector<OpenACCReductionRecipe::CombinerRecipe> Combiners;
+ unsigned NumCombiners = readInt();
+ for (unsigned I = 0; I < NumCombiners; ++I) {
+ VarDecl *LHS = readDeclAs<VarDecl>();
+ VarDecl *RHS = readDeclAs<VarDecl>();
+ Expr *Op = readExpr();
+
+ Combiners.push_back({LHS, RHS, Op});
+ }
+
+ RecipeList.push_back({Recipe, Combiners});
}
return OpenACCReductionClause::Create(getContext(), BeginLoc, LParenLoc, Op,
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 09b1e58..82ccde8 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -8925,8 +8925,17 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) {
writeOpenACCVarList(RC);
for (const OpenACCReductionRecipe &R : RC->getRecipes()) {
- static_assert(sizeof(OpenACCReductionRecipe) == 1 * sizeof(int *));
AddDeclRef(R.AllocaDecl);
+
+ static_assert(sizeof(OpenACCReductionRecipe::CombinerRecipe) ==
+ 3 * sizeof(int *));
+ writeUInt32(R.CombinerRecipes.size());
+
+ for (auto &CombinerRecipe : R.CombinerRecipes) {
+ AddDeclRef(CombinerRecipe.LHS);
+ AddDeclRef(CombinerRecipe.RHS);
+ AddStmt(CombinerRecipe.Op);
+ }
}
return;
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
index 15a0c5a..ace639c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
@@ -232,7 +232,7 @@ public:
bool ignoreARC =
!PD->isReadOnly() && PD->getSetterKind() == ObjCPropertyDecl::Assign;
auto IsUnsafePtr = isUnsafePtr(QT, ignoreARC);
- return {IsUnsafePtr && *IsUnsafePtr, PropType};
+ return {IsUnsafePtr && *IsUnsafePtr && !PD->isRetaining(), PropType};
}
bool shouldSkipDecl(const RecordDecl *RD) const {
diff --git a/clang/test/AST/ByteCode/libcxx/deref-to-array.cpp b/clang/test/AST/ByteCode/libcxx/deref-to-array.cpp
index 2a527ab..7cfcd76 100644
--- a/clang/test/AST/ByteCode/libcxx/deref-to-array.cpp
+++ b/clang/test/AST/ByteCode/libcxx/deref-to-array.cpp
@@ -270,7 +270,7 @@ template <class _Op, class _Yp>
void __is_derived_from_view_interface();
template <class _Tp>
bool enable_view = derived_from<_Tp, int> ||
- requires { ranges::__is_derived_from_view_interface; };
+ requires { ranges::__is_derived_from_view_interface<_Tp, int>(); };
template <class>
concept range = requires { ranges::end; };
template <class _Tp>
diff --git a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl
index 6779abb..e72207e 100644
--- a/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl
+++ b/clang/test/AST/HLSL/StructuredBuffers-AST.hlsl
@@ -4,7 +4,7 @@
//
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump \
// RUN: -DRESOURCE=StructuredBuffer %s | FileCheck -DRESOURCE=StructuredBuffer \
-// RUN: -check-prefixes=CHECK,CHECK-SRV,CHECK-SUBSCRIPT,CHECK-LOAD %s
+// RUN: -check-prefixes=CHECK,CHECK-SRV,CHECK-SUBSCRIPT,CHECK-LOAD,CHECK-BINDING %s
//
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY \
// RUN: -DRESOURCE=RWStructuredBuffer %s | FileCheck -DRESOURCE=RWStructuredBuffer \
@@ -141,59 +141,133 @@ RESOURCE<float> Buffer;
// Static __createFromBinding method
-// CHECK: CXXMethodDecl {{.*}} __createFromBinding 'hlsl::[[RESOURCE]]<element_type> (unsigned int, unsigned int, int, unsigned int, const char *)' static
-// CHECK-NEXT: ParmVarDecl {{.*}} registerNo 'unsigned int'
-// CHECK-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int'
-// CHECK-NEXT: ParmVarDecl {{.*}} range 'int'
-// CHECK-NEXT: ParmVarDecl {{.*}} index 'unsigned int'
-// CHECK-NEXT: ParmVarDecl {{.*}} name 'const char *'
-// CHECK-NEXT: CompoundStmt
-// CHECK-NEXT: DeclStmt
-// CHECK-NEXT: VarDecl {{.*}} tmp 'hlsl::[[RESOURCE]]<element_type>'
-// CHECK-NEXT: BinaryOperator {{.*}} '__hlsl_resource_t {{.*}}]]' '='
-// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
-// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
-// CHECK-NEXT: CallExpr {{.*}} '__hlsl_resource_t {{.*}}'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
-// CHECK-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_handlefrombinding' 'void (...) noexcept'
-// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
-// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'registerNo' 'unsigned int'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'const char *' ParmVar {{.*}} 'name' 'const char *'
-// CHECK-NEXT: ReturnStmt
-// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
-// CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline
+// CHECK-BINDING: CXXMethodDecl {{.*}} __createFromBinding 'hlsl::[[RESOURCE]]<element_type> (unsigned int, unsigned int, int, unsigned int, const char *)' static
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} registerNo 'unsigned int'
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int'
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} range 'int'
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} index 'unsigned int'
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} name 'const char *'
+// CHECK-BINDING-NEXT: CompoundStmt
+// CHECK-BINDING-NEXT: DeclStmt
+// CHECK-BINDING-NEXT: VarDecl {{.*}} tmp 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-BINDING-NEXT: BinaryOperator {{.*}} '__hlsl_resource_t {{.*}}]]' '='
+// CHECK-BINDING-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-BINDING-NEXT: CallExpr {{.*}} '__hlsl_resource_t {{.*}}'
+// CHECK-BINDING-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_handlefrombinding' 'void (...) noexcept'
+// CHECK-BINDING-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'registerNo' 'unsigned int'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'const char *' ParmVar {{.*}} 'name' 'const char *'
+// CHECK-BINDING-NEXT: ReturnStmt
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-BINDING-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline
// Static __createFromImplicitBinding method
-// CHECK: CXXMethodDecl {{.*}} __createFromImplicitBinding 'hlsl::[[RESOURCE]]<element_type> (unsigned int, unsigned int, int, unsigned int, const char *)' static
-// CHECK-NEXT: ParmVarDecl {{.*}} orderId 'unsigned int'
-// CHECK-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int'
-// CHECK-NEXT: ParmVarDecl {{.*}} range 'int'
-// CHECK-NEXT: ParmVarDecl {{.*}} index 'unsigned int'
-// CHECK-NEXT: ParmVarDecl {{.*}} name 'const char *'
-// CHECK-NEXT: CompoundStmt {{.*}}
-// CHECK-NEXT: DeclStmt {{.*}}
-// CHECK-NEXT: VarDecl {{.*}} tmp 'hlsl::[[RESOURCE]]<element_type>'
-// CHECK-NEXT: BinaryOperator {{.*}} '__hlsl_resource_t {{.*}}]]' '='
-// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
-// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
-// CHECK-NEXT: CallExpr {{.*}} '__hlsl_resource_t {{.*}}'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
-// CHECK-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_handlefromimplicitbinding' 'void (...) noexcept'
-// CHECK-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
-// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'orderId' 'unsigned int'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int'
-// CHECK-NEXT: DeclRefExpr {{.*}} 'const char *' ParmVar {{.*}} 'name' 'const char *'
-// CHECK-NEXT: ReturnStmt
-// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
-// CHECK-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline
+// CHECK-BINDING: CXXMethodDecl {{.*}} __createFromImplicitBinding 'hlsl::[[RESOURCE]]<element_type> (unsigned int, unsigned int, int, unsigned int, const char *)' static
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} orderId 'unsigned int'
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int'
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} range 'int'
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} index 'unsigned int'
+// CHECK-BINDING-NEXT: ParmVarDecl {{.*}} name 'const char *'
+// CHECK-BINDING-NEXT: CompoundStmt {{.*}}
+// CHECK-BINDING-NEXT: DeclStmt {{.*}}
+// CHECK-BINDING-NEXT: VarDecl {{.*}} tmp 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-BINDING-NEXT: BinaryOperator {{.*}} '__hlsl_resource_t {{.*}}]]' '='
+// CHECK-BINDING-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-BINDING-NEXT: CallExpr {{.*}} '__hlsl_resource_t {{.*}}'
+// CHECK-BINDING-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_handlefromimplicitbinding' 'void (...) noexcept'
+// CHECK-BINDING-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'orderId' 'unsigned int'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int'
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'const char *' ParmVar {{.*}} 'name' 'const char *'
+// CHECK-BINDING-NEXT: ReturnStmt
+// CHECK-BINDING-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-BINDING-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline
+
+// CHECK-COUNTER-HANDLE: CXXMethodDecl {{.*}} __createFromBindingWithImplicitCounter 'hlsl::[[RESOURCE]]<element_type> (unsigned int, unsigned int, int, unsigned int, const char *, unsigned int)' static
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} registerNo 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} range 'int'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} index 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} name 'const char *'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} counterOrderId 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: CompoundStmt
+// CHECK-COUNTER-HANDLE-NEXT: DeclStmt
+// CHECK-COUNTER-HANDLE-NEXT: VarDecl {{.*}} tmp 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: BinaryOperator {{.*}} '__hlsl_resource_t {{.*}}]]' '='
+// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: CallExpr {{.*}} '__hlsl_resource_t {{.*}}'
+// CHECK-COUNTER-HANDLE-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_handlefrombinding' 'void (...) noexcept'
+// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'registerNo' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'const char *' ParmVar {{.*}} 'name' 'const char *'
+// CHECK-COUNTER-HANDLE-NEXT: BinaryOperator {{.*}} '__hlsl_resource_t {{.*}}]]' '='
+// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__counter_handle
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: CallExpr {{.*}} '__hlsl_resource_t {{.*}}'
+// CHECK-COUNTER-HANDLE-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_counterhandlefromimplicitbinding' 'void (...) noexcept'
+// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'counterOrderId' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: ReturnStmt
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline
+
+// CHECK-COUNTER-HANDLE: CXXMethodDecl {{.*}} __createFromImplicitBindingWithImplicitCounter 'hlsl::[[RESOURCE]]<element_type> (unsigned int, unsigned int, int, unsigned int, const char *, unsigned int)' static
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} orderId 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} spaceNo 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} range 'int'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} index 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} name 'const char *'
+// CHECK-COUNTER-HANDLE-NEXT: ParmVarDecl {{.*}} counterOrderId 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: CompoundStmt
+// CHECK-COUNTER-HANDLE-NEXT: DeclStmt
+// CHECK-COUNTER-HANDLE-NEXT: VarDecl {{.*}} tmp 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: BinaryOperator {{.*}} '__hlsl_resource_t {{.*}}]]' '='
+// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: CallExpr {{.*}} '__hlsl_resource_t {{.*}}'
+// CHECK-COUNTER-HANDLE-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_handlefromimplicitbinding' 'void (...) noexcept'
+// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'orderId' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'int' ParmVar {{.*}} 'range' 'int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'index' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'const char *' ParmVar {{.*}} 'name' 'const char *'
+// CHECK-COUNTER-HANDLE-NEXT: BinaryOperator {{.*}} '__hlsl_resource_t {{.*}}]]' '='
+// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__counter_handle
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: CallExpr {{.*}} '__hlsl_resource_t {{.*}}'
+// CHECK-COUNTER-HANDLE-NEXT: ImplicitCastExpr {{.*}} 'void (*)(...) noexcept' <BuiltinFnToFnPtr>
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} '<builtin fn type>' Function {{.*}} '__builtin_hlsl_resource_counterhandlefromimplicitbinding' 'void (...) noexcept'
+// CHECK-COUNTER-HANDLE-NEXT: MemberExpr {{.*}} '__hlsl_resource_t {{.*}}' lvalue .__handle
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'counterOrderId' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'unsigned int' ParmVar {{.*}} 'spaceNo' 'unsigned int'
+// CHECK-COUNTER-HANDLE-NEXT: ReturnStmt
+// CHECK-COUNTER-HANDLE-NEXT: DeclRefExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue Var {{.*}} 'tmp' 'hlsl::[[RESOURCE]]<element_type>'
+// CHECK-COUNTER-HANDLE-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline
// Subscript operators
@@ -263,7 +337,7 @@ RESOURCE<float> Buffer;
// CHECK-COUNTER-NEXT: MemberExpr {{.*}} '__hlsl_resource_t
// CHECK-COUNTER-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
// CHECK-COUNTER-SAME{LITERAL}: [[hlsl::raw_buffer]]
-// CHECK-COUNTER-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]' lvalue .__handle
+// CHECK-COUNTER-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]' lvalue .__counter_handle
// CHECK-COUNTER-NEXT: CXXThisExpr {{.*}} 'hlsl::RWStructuredBuffer<element_type>' lvalue implicit this
// CHECK-COUNTER-NEXT: IntegerLiteral {{.*}} 'int' 1
// CHECK-COUNTER-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline
@@ -279,7 +353,7 @@ RESOURCE<float> Buffer;
// CHECK-COUNTER-NEXT: MemberExpr {{.*}} '__hlsl_resource_t
// CHECK-COUNTER-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
// CHECK-COUNTER-SAME{LITERAL}: [[hlsl::raw_buffer]]
-// CHECK-COUNTER-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]' lvalue .__handle
+// CHECK-COUNTER-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]' lvalue .__counter_handle
// CHECK-COUNTER-NEXT: CXXThisExpr {{.*}} 'hlsl::RWStructuredBuffer<element_type>' lvalue implicit this
// CHECK-COUNTER-NEXT: IntegerLiteral {{.*}} 'int' -1
// CHECK-COUNTER-NEXT: AlwaysInlineAttr {{.*}} Implicit always_inline
@@ -305,7 +379,7 @@ RESOURCE<float> Buffer;
// CHECK-APPEND-NEXT: MemberExpr {{.*}} '__hlsl_resource_t
// CHECK-APPEND-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
// CHECK-APPEND-SAME{LITERAL}: [[hlsl::raw_buffer]]
-// CHECK-APPEND-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]' lvalue .__handle
+// CHECK-APPEND-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]' lvalue .__counter_handle
// CHECK-APPEND-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue implicit this
// CHECK-APPEND-NEXT: IntegerLiteral {{.*}} 'int' 1
// CHECK-APPEND-NEXT: DeclRefExpr {{.*}} 'element_type' ParmVar {{.*}} 'value' 'element_type'
@@ -330,7 +404,7 @@ RESOURCE<float> Buffer;
// CHECK-CONSUME-NEXT: MemberExpr {{.*}} '__hlsl_resource_t
// CHECK-CONSUME-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
// CHECK-CONSUME-SAME{LITERAL}: [[hlsl::raw_buffer]]
-// CHECK-CONSUME-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]' lvalue .__handle
+// CHECK-CONSUME-SAME{LITERAL}: [[hlsl::contained_type(element_type)]]' lvalue .__counter_handle
// CHECK-CONSUME-NEXT: CXXThisExpr {{.*}} 'hlsl::[[RESOURCE]]<element_type>' lvalue implicit this
// CHECK-CONSUME-NEXT: IntegerLiteral {{.*}} 'int' -1
diff --git a/clang/test/AST/HLSL/vk_binding_attr.hlsl b/clang/test/AST/HLSL/vk_binding_attr.hlsl
index 13e7544..da5a42c 100644
--- a/clang/test/AST/HLSL/vk_binding_attr.hlsl
+++ b/clang/test/AST/HLSL/vk_binding_attr.hlsl
@@ -76,13 +76,13 @@
// CHECK: VarDecl {{.*}} Buf6 'RWStructuredBuffer<int>':'hlsl::RWStructuredBuffer<int>'
// CHECK-NEXT: CallExpr {{.*}} 'RWStructuredBuffer<int>':'hlsl::RWStructuredBuffer<int>'
-// CHECK-NEXT: ImplicitCastExpr {{.*}} 'hlsl::RWStructuredBuffer<int> (*)(unsigned int, unsigned int, int, unsigned int, const char *)' <FunctionToPointerDecay>
-// SPV-NEXT: DeclRefExpr {{.*}} 'hlsl::RWStructuredBuffer<int> (unsigned int, unsigned int, int, unsigned int, const char *)'
-// SPV-NEXT-SAME: CXXMethod {{.*}} '__createFromBinding' 'hlsl::RWStructuredBuffer<int> (unsigned int, unsigned int, int, unsigned int, const char *)'
+// CHECK-NEXT: ImplicitCastExpr {{.*}} 'hlsl::RWStructuredBuffer<int> (*)(unsigned int, unsigned int, int, unsigned int, const char *, unsigned int)' <FunctionToPointerDecay>
+// SPV-NEXT: DeclRefExpr {{.*}} 'hlsl::RWStructuredBuffer<int> (unsigned int, unsigned int, int, unsigned int, const char *, unsigned int)'
+// SPV-NEXT-SAME: CXXMethod {{.*}} '__createFromBindingWithImplicitCounter' 'hlsl::RWStructuredBuffer<int> (unsigned int, unsigned int, int, unsigned int, const char *, unsigned int)'
// SPV-NEXT: IntegerLiteral {{.*}} 'unsigned int' 26
// SPV-NEXT: IntegerLiteral {{.*}} 'unsigned int' 105
-// DXIL-NEXT: DeclRefExpr {{.*}} 'hlsl::RWStructuredBuffer<int> (unsigned int, unsigned int, int, unsigned int, const char *)'
-// DXIL-NEXT-SAME: CXXMethod {{.*}} '__createFromBinding' 'hlsl::RWStructuredBuffer<int> (unsigned int, unsigned int, int, unsigned int, const char *)'
+// DXIL-NEXT: DeclRefExpr {{.*}} 'hlsl::RWStructuredBuffer<int> (unsigned int, unsigned int, int, unsigned int, const char *, unsigned int)'
+// DXIL-NEXT-SAME: CXXMethod {{.*}} '__createFromImplicitBindingWithImplicitCounter' 'hlsl::RWStructuredBuffer<int> (unsigned int, unsigned int, int, unsigned int, const char *, unsigned int)'
// DXIL-NEXT: IntegerLiteral {{.*}} 'unsigned int' 4
// DXIL-NEXT: IntegerLiteral {{.*}} 'unsigned int' 0
// CHECK: HLSLVkBindingAttr {{.*}} 26 105
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members.mm
index adf1d8a..2b120b9 100644
--- a/clang/test/Analysis/Checkers/WebKit/unretained-members.mm
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-members.mm
@@ -113,7 +113,6 @@ namespace ptr_to_ptr_to_retained {
// expected-warning@-1{{Instance variable 'dispatch' in 'AnotherObject' is a retainable type 'dispatch_queue_t'}}
}
@property(nonatomic, readonly, strong) NSString *prop_string;
-// expected-warning@-1{{Property 'prop_string' in 'AnotherObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
@property(nonatomic, readonly) NSString *prop_safe;
@end
@@ -132,7 +131,6 @@ namespace ptr_to_ptr_to_retained {
// expected-warning@-1{{Instance variable 'os_dispatch' in 'DerivedObject' is a retainable type 'dispatch_queue_t'}}
}
@property(nonatomic, strong) NSNumber *prop_number;
-// expected-warning@-1{{Property 'prop_number' in 'DerivedObject' is a raw pointer to retainable type 'NSNumber'; member variables must be a RetainPtr}}
@property(nonatomic, readonly) NSString *prop_string;
@end
@@ -178,12 +176,12 @@ NS_REQUIRES_PROPERTY_DEFINITIONS
}
@property(nonatomic, readonly, strong) NSString *prop_string1;
@property(nonatomic, readonly, strong) NSString *prop_string2;
-// expected-warning@-1{{Property 'prop_string2' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'}}
@property(nonatomic, assign) NSString *prop_string3;
// expected-warning@-1{{Property 'prop_string3' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
@property(nonatomic, unsafe_unretained) NSString *prop_string4;
// expected-warning@-1{{Property 'prop_string4' in 'NoSynthObject' is a raw pointer to retainable type 'NSString'; member variables must be a RetainPtr}}
-@property(nonatomic, readonly, strong) NSString *dispatch;
+@property(nonatomic, copy) NSString *prop_string5;
+@property(nonatomic, readonly, strong) dispatch_queue_t dispatch;
@end
@implementation NoSynthObject
@@ -193,6 +191,7 @@ NS_REQUIRES_PROPERTY_DEFINITIONS
@synthesize prop_string2;
@synthesize prop_string3;
@synthesize prop_string4;
+@synthesize prop_string5;
- (dispatch_queue_t)dispatch {
return nil;
}
diff --git a/clang/test/CIR/CodeGen/dtors.cpp b/clang/test/CIR/CodeGen/dtors.cpp
index 49952a7..7fb0975 100644
--- a/clang/test/CIR/CodeGen/dtors.cpp
+++ b/clang/test/CIR/CodeGen/dtors.cpp
@@ -208,3 +208,84 @@ void test_nested_dtor() {
// OGCG: define {{.*}} void @_ZN1DD2Ev
// OGCG: %[[C:.*]] = getelementptr inbounds i8, ptr %{{.*}}, i64 4
// OGCG: call void @_ZN1CD1Ev(ptr {{.*}} %[[C]])
+
+struct E {
+ ~E();
+};
+
+struct F : public E {
+ int n;
+ ~F() {}
+};
+
+// CIR: cir.func {{.*}} @_ZN1FD2Ev
+// CIR: %[[BASE_E:.*]] = cir.base_class_addr %{{.*}} : !cir.ptr<!rec_F> nonnull [0] -> !cir.ptr<!rec_E>
+// CIR: cir.call @_ZN1ED2Ev(%[[BASE_E]]) nothrow : (!cir.ptr<!rec_E>) -> ()
+
+// Because E is at offset 0 in F, there is no getelementptr needed.
+
+// LLVM: define {{.*}} void @_ZN1FD2Ev
+// LLVM: call void @_ZN1ED2Ev(ptr %{{.*}})
+
+// This destructor is defined after the calling function in OGCG.
+
+void test_base_dtor_call() {
+ F f;
+}
+
+// CIR: cir.func {{.*}} @_Z19test_base_dtor_callv()
+// cir.call @_ZN1FD2Ev(%{{.*}}) nothrow : (!cir.ptr<!rec_F>) -> ()
+
+// LLVM: define {{.*}} void @_Z19test_base_dtor_callv()
+// LLVM: call void @_ZN1FD2Ev(ptr %{{.*}})
+
+// OGCG: define {{.*}} void @_Z19test_base_dtor_callv()
+// OGCG: call void @_ZN1FD2Ev(ptr {{.*}} %{{.*}})
+
+// OGCG: define {{.*}} void @_ZN1FD2Ev
+// OGCG: call void @_ZN1ED2Ev(ptr {{.*}} %{{.*}})
+
+struct VirtualBase {
+ ~VirtualBase();
+};
+
+struct Derived : virtual VirtualBase {
+ ~Derived() {}
+};
+
+void test_base_dtor_call_virtual_base() {
+ Derived d;
+}
+
+// Derived D2 (base) destructor -- does not call VirtualBase destructor
+
+// CIR: cir.func {{.*}} @_ZN7DerivedD2Ev
+// CIR-NOT: cir.call{{.*}} @_ZN11VirtualBaseD2Ev
+// CIR: cir.return
+
+// LLVM: define {{.*}} void @_ZN7DerivedD2Ev
+// LLVM-NOT: call{{.*}} @_ZN11VirtualBaseD2Ev
+// LLVM: ret
+
+// Derived D1 (complete) destructor -- does call VirtualBase destructor
+
+// CIR: cir.func {{.*}} @_ZN7DerivedD1Ev
+// CIR: %[[THIS:.*]] = cir.load %{{.*}}
+// CIR: %[[VTT:.*]] = cir.vtt.address_point @_ZTT7Derived, offset = 0 -> !cir.ptr<!cir.ptr<!void>>
+// CIR: cir.call @_ZN7DerivedD2Ev(%[[THIS]], %[[VTT]])
+// CIR: %[[VIRTUAL_BASE:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr<!rec_Derived> nonnull [0] -> !cir.ptr<!rec_VirtualBase>
+// CIR: cir.call @_ZN11VirtualBaseD2Ev(%[[VIRTUAL_BASE]])
+
+// LLVM: define {{.*}} void @_ZN7DerivedD1Ev
+// LLVM: call void @_ZN7DerivedD2Ev(ptr %{{.*}}, ptr @_ZTT7Derived)
+// LLVM: call void @_ZN11VirtualBaseD2Ev(ptr %{{.*}})
+
+// OGCG emits these destructors in reverse order
+
+// OGCG: define {{.*}} void @_ZN7DerivedD1Ev
+// OGCG: call void @_ZN7DerivedD2Ev(ptr {{.*}} %{{.*}}, ptr {{.*}} @_ZTT7Derived)
+// OGCG: call void @_ZN11VirtualBaseD2Ev(ptr {{.*}} %{{.*}})
+
+// OGCG: define {{.*}} void @_ZN7DerivedD2Ev
+// OGCG-NOT: call{{.*}} @_ZN11VirtualBaseD2Ev
+// OGCG: ret
diff --git a/clang/test/CIR/CodeGen/dynamic-cast.cpp b/clang/test/CIR/CodeGen/dynamic-cast.cpp
new file mode 100644
index 0000000..e5244b2
--- /dev/null
+++ b/clang/test/CIR/CodeGen/dynamic-cast.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2> %t.before.log
+// RUN: FileCheck %s --input-file=%t.before.log -check-prefix=CIR-BEFORE
+
+struct Base {
+ virtual ~Base();
+};
+
+struct Derived : Base {};
+
+// CIR-BEFORE-DAG: !rec_Base = !cir.record
+// CIR-BEFORE-DAG: !rec_Derived = !cir.record
+// CIR-BEFORE-DAG: #dyn_cast_info__ZTI4Base__ZTI7Derived = #cir.dyn_cast_info<src_rtti = #cir.global_view<@_ZTI4Base> : !cir.ptr<!u8i>, dest_rtti = #cir.global_view<@_ZTI7Derived> : !cir.ptr<!u8i>, runtime_func = @__dynamic_cast, bad_cast_func = @__cxa_bad_cast, offset_hint = #cir.int<0> : !s64i>
+
+Derived *ptr_cast(Base *b) {
+ return dynamic_cast<Derived *>(b);
+}
+
+// CIR-BEFORE: cir.func dso_local @_Z8ptr_castP4Base
+// CIR-BEFORE: %{{.+}} = cir.dyn_cast ptr %{{.+}} : !cir.ptr<!rec_Base> -> !cir.ptr<!rec_Derived> #dyn_cast_info__ZTI4Base__ZTI7Derived
+// CIR-BEFORE: }
+
+Derived &ref_cast(Base &b) {
+ return dynamic_cast<Derived &>(b);
+}
+
+// CIR-BEFORE: cir.func dso_local @_Z8ref_castR4Base
+// CIR-BEFORE: %{{.+}} = cir.dyn_cast ref %{{.+}} : !cir.ptr<!rec_Base> -> !cir.ptr<!rec_Derived> #dyn_cast_info__ZTI4Base__ZTI7Derived
+// CIR-BEFORE: }
diff --git a/clang/test/CIR/CodeGen/global-init.cpp b/clang/test/CIR/CodeGen/global-init.cpp
index 2afb5a5..0aab695 100644
--- a/clang/test/CIR/CodeGen/global-init.cpp
+++ b/clang/test/CIR/CodeGen/global-init.cpp
@@ -6,6 +6,23 @@
// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+// Declarations that appear before global-specific definitions
+
+// CIR: module @{{.*}} attributes {
+// CIR-SAME: cir.global_ctors = [#cir.global_ctor<"_GLOBAL__sub_I_[[FILENAME:.*]]", 65535>]
+
+// LLVM: @__dso_handle = external hidden global i8
+// LLVM: @needsCtor = global %struct.NeedsCtor zeroinitializer, align 1
+// LLVM: @needsDtor = global %struct.NeedsDtor zeroinitializer, align 1
+// LLVM: @needsCtorDtor = global %struct.NeedsCtorDtor zeroinitializer, align 1
+// LLVM: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_[[FILENAME:.*]], ptr null }]
+
+// OGCG: @needsCtor = global %struct.NeedsCtor zeroinitializer, align 1
+// OGCG: @needsDtor = global %struct.NeedsDtor zeroinitializer, align 1
+// OGCG: @__dso_handle = external hidden global i8
+// OGCG: @needsCtorDtor = global %struct.NeedsCtorDtor zeroinitializer, align 1
+// OGCG: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_[[FILENAME:.*]], ptr null }]
+
struct NeedsCtor {
NeedsCtor();
};
@@ -16,34 +33,89 @@ NeedsCtor needsCtor;
// CIR-BEFORE-LPP: %[[THIS:.*]] = cir.get_global @needsCtor : !cir.ptr<!rec_NeedsCtor>
// CIR-BEFORE-LPP: cir.call @_ZN9NeedsCtorC1Ev(%[[THIS]]) : (!cir.ptr<!rec_NeedsCtor>) -> ()
-// CIR: module @{{.*}} attributes {
-// CIR-SAME: cir.global_ctors = [#cir.global_ctor<"_GLOBAL__sub_I_[[FILENAME:.*]]", 65535>]
-
// CIR: cir.global external @needsCtor = #cir.zero : !rec_NeedsCtor
// CIR: cir.func internal private @__cxx_global_var_init() {
// CIR: %0 = cir.get_global @needsCtor : !cir.ptr<!rec_NeedsCtor>
// CIR: cir.call @_ZN9NeedsCtorC1Ev(%0) : (!cir.ptr<!rec_NeedsCtor>) -> ()
-// CIR: cir.func private @_GLOBAL__sub_I_[[FILENAME:.*]]() {
-// CIR: cir.call @__cxx_global_var_init() : () -> ()
-// CIR: cir.return
-// CIR: }
-
-// LLVM: @needsCtor = global %struct.NeedsCtor zeroinitializer, align 1
-// LLVM: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_[[FILENAME:.*]], ptr null }]
-// LLVM: declare void @_ZN9NeedsCtorC1Ev(ptr)
-
// LLVM: define internal void @__cxx_global_var_init()
// LLVM: call void @_ZN9NeedsCtorC1Ev(ptr @needsCtor)
-// LLVM: define void @_GLOBAL__sub_I_[[FILENAME]]()
-// LLVM: call void @__cxx_global_var_init()
-
-// OGCG: @needsCtor = global %struct.NeedsCtor zeroinitializer, align 1
-// OGCG: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_[[FILENAME:.*]], ptr null }]
-
// OGCG: define internal void @__cxx_global_var_init() {{.*}} section ".text.startup" {
// OGCG: call void @_ZN9NeedsCtorC1Ev(ptr noundef nonnull align 1 dereferenceable(1) @needsCtor)
+
+struct NeedsDtor {
+ ~NeedsDtor();
+};
+
+NeedsDtor needsDtor;
+
+// CIR-BEFORE-LPP: cir.global external @needsDtor = #cir.zero : !rec_NeedsDtor dtor {
+// CIR-BEFORE-LPP: %[[THIS:.*]] = cir.get_global @needsDtor : !cir.ptr<!rec_NeedsDtor>
+// CIR-BEFORE-LPP: cir.call @_ZN9NeedsDtorD1Ev(%[[THIS]]) : (!cir.ptr<!rec_NeedsDtor>) -> ()
+
+// CIR: cir.global external @needsDtor = #cir.zero : !rec_NeedsDtor
+// CIR: cir.func internal private @__cxx_global_var_init.1() {
+// CIR: %[[OBJ:.*]] = cir.get_global @needsDtor : !cir.ptr<!rec_NeedsDtor>
+// CIR: %[[DTOR:.*]] = cir.get_global @_ZN9NeedsDtorD1Ev : !cir.ptr<!cir.func<(!cir.ptr<!rec_NeedsDtor>)>>
+// CIR: %[[DTOR_CAST:.*]] = cir.cast bitcast %[[DTOR]] : !cir.ptr<!cir.func<(!cir.ptr<!rec_NeedsDtor>)>> -> !cir.ptr<!cir.func<(!cir.ptr<!void>)>>
+// CIR: %[[OBJ_CAST:.*]] = cir.cast bitcast %[[OBJ]] : !cir.ptr<!rec_NeedsDtor> -> !cir.ptr<!void>
+// CIR: %[[HANDLE:.*]] = cir.get_global @__dso_handle : !cir.ptr<i8>
+// CIR: cir.call @__cxa_atexit(%[[DTOR_CAST]], %[[OBJ_CAST]], %[[HANDLE]]) : (!cir.ptr<!cir.func<(!cir.ptr<!void>)>>, !cir.ptr<!void>, !cir.ptr<i8>) -> ()
+
+// LLVM: define internal void @__cxx_global_var_init.1() {
+// LLVM: call void @__cxa_atexit(ptr @_ZN9NeedsDtorD1Ev, ptr @needsDtor, ptr @__dso_handle)
+
+// OGCG: define internal void @__cxx_global_var_init.1() {{.*}} section ".text.startup" {
+// OGCG: %{{.*}} = call i32 @__cxa_atexit(ptr @_ZN9NeedsDtorD1Ev, ptr @needsDtor, ptr @__dso_handle)
+
+struct NeedsCtorDtor {
+ NeedsCtorDtor();
+ ~NeedsCtorDtor();
+};
+
+NeedsCtorDtor needsCtorDtor;
+
+// CIR-BEFORE-LPP: cir.global external @needsCtorDtor = ctor : !rec_NeedsCtorDtor {
+// CIR-BEFORE-LPP: %[[THIS:.*]] = cir.get_global @needsCtorDtor : !cir.ptr<!rec_NeedsCtorDtor>
+// CIR-BEFORE-LPP: cir.call @_ZN13NeedsCtorDtorC1Ev(%[[THIS]]) : (!cir.ptr<!rec_NeedsCtorDtor>) -> ()
+// CIR-BEFORE-LPP: } dtor {
+// CIR-BEFORE-LPP: %[[THIS:.*]] = cir.get_global @needsCtorDtor : !cir.ptr<!rec_NeedsCtorDtor>
+// CIR-BEFORE-LPP: cir.call @_ZN13NeedsCtorDtorD1Ev(%[[THIS]]) : (!cir.ptr<!rec_NeedsCtorDtor>) -> ()
+
+// CIR: cir.global external @needsCtorDtor = #cir.zero : !rec_NeedsCtorDtor
+// CIR: cir.func internal private @__cxx_global_var_init.2() {
+// CIR: %[[OBJ:.*]] = cir.get_global @needsCtorDtor : !cir.ptr<!rec_NeedsCtorDtor>
+// CIR: cir.call @_ZN13NeedsCtorDtorC1Ev(%[[OBJ]]) : (!cir.ptr<!rec_NeedsCtorDtor>) -> ()
+// CIR: %[[OBJ:.*]] = cir.get_global @needsCtorDtor : !cir.ptr<!rec_NeedsCtorDtor>
+// CIR: %[[DTOR:.*]] = cir.get_global @_ZN13NeedsCtorDtorD1Ev : !cir.ptr<!cir.func<(!cir.ptr<!rec_NeedsCtorDtor>)>>
+// CIR: %[[DTOR_CAST:.*]] = cir.cast bitcast %[[DTOR]] : !cir.ptr<!cir.func<(!cir.ptr<!rec_NeedsCtorDtor>)>> -> !cir.ptr<!cir.func<(!cir.ptr<!void>)>>
+// CIR: %[[OBJ_CAST:.*]] = cir.cast bitcast %[[OBJ]] : !cir.ptr<!rec_NeedsCtorDtor> -> !cir.ptr<!void>
+// CIR: %[[HANDLE:.*]] = cir.get_global @__dso_handle : !cir.ptr<i8>
+// CIR: cir.call @__cxa_atexit(%[[DTOR_CAST]], %[[OBJ_CAST]], %[[HANDLE]]) : (!cir.ptr<!cir.func<(!cir.ptr<!void>)>>, !cir.ptr<!void>, !cir.ptr<i8>) -> ()
+
+// LLVM: define internal void @__cxx_global_var_init.2() {
+// LLVM: call void @_ZN13NeedsCtorDtorC1Ev(ptr @needsCtorDtor)
+// LLVM: call void @__cxa_atexit(ptr @_ZN13NeedsCtorDtorD1Ev, ptr @needsCtorDtor, ptr @__dso_handle)
+
+// OGCG: define internal void @__cxx_global_var_init.2() {{.*}} section ".text.startup" {
+// OGCG: call void @_ZN13NeedsCtorDtorC1Ev(ptr noundef nonnull align 1 dereferenceable(1) @needsCtorDtor)
+// OGCG: %{{.*}} = call i32 @__cxa_atexit(ptr @_ZN13NeedsCtorDtorD1Ev, ptr @needsCtorDtor, ptr @__dso_handle)
+
+// Common init function for all globals with default priority
+
+// CIR: cir.func private @_GLOBAL__sub_I_[[FILENAME:.*]]() {
+// CIR: cir.call @__cxx_global_var_init() : () -> ()
+// CIR: cir.call @__cxx_global_var_init.1() : () -> ()
+// CIR: cir.call @__cxx_global_var_init.2() : () -> ()
+
+// LLVM: define void @_GLOBAL__sub_I_[[FILENAME]]()
+// LLVM: call void @__cxx_global_var_init()
+// LLVM: call void @__cxx_global_var_init.1()
+// LLVM: call void @__cxx_global_var_init.2()
+
// OGCG: define internal void @_GLOBAL__sub_I_[[FILENAME]]() {{.*}} section ".text.startup" {
// OGCG: call void @__cxx_global_var_init()
+// OGCG: call void @__cxx_global_var_init.1()
+// OGCG: call void @__cxx_global_var_init.2()
diff --git a/clang/test/CIR/CodeGen/record-zero-init-padding.c b/clang/test/CIR/CodeGen/record-zero-init-padding.c
new file mode 100644
index 0000000..f131c9b
--- /dev/null
+++ b/clang/test/CIR/CodeGen/record-zero-init-padding.c
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+struct padding_after_field {
+ char c;
+ int i;
+};
+
+struct bitfield_with_padding {
+ unsigned int a : 3;
+ unsigned int b : 5;
+ int c;
+};
+
+struct tail_padding {
+ int a;
+ char b;
+};
+
+struct multiple_padding {
+ char a;
+ short b;
+ long long c;
+};
+
+void test_zero_init_padding(void) {
+ static const struct padding_after_field paf = {1, 42};
+ static const struct bitfield_with_padding bfp = {1, 2, 99};
+ static const struct tail_padding tp = {10, 20};
+ static const struct multiple_padding mp = {5, 10, 100};
+}
+
+// Type definitions for anonymous structs with padding
+// CIR-DAG: !rec_anon_struct = !cir.record<struct {!s8i, !u8i, !s16i, !cir.array<!u8i x 4>, !s64i}>
+// CIR-DAG: !rec_anon_struct1 = !cir.record<struct {!s32i, !s8i, !cir.array<!u8i x 3>}>
+// CIR-DAG: !rec_anon_struct2 = !cir.record<struct {!u8i, !cir.array<!u8i x 3>, !s32i}>
+// CIR-DAG: !rec_anon_struct3 = !cir.record<struct {!s8i, !cir.array<!u8i x 3>, !s32i}>
+
+// paf: char + 3 bytes padding + int -> uses !rec_anon_struct3
+// CIR-DAG: cir.global "private" internal dso_local @test_zero_init_padding.paf = #cir.const_record<{
+// CIR-DAG-SAME: #cir.int<1> : !s8i,
+// CIR-DAG-SAME: #cir.const_array<[#cir.zero : !u8i, #cir.zero : !u8i, #cir.zero : !u8i]> : !cir.array<!u8i x 3>,
+// CIR-DAG-SAME: #cir.int<42> : !s32i
+// CIR-DAG-SAME: }> : !rec_anon_struct3
+
+// bfp: unsigned bitfield byte + 3 bytes padding + int -> uses !rec_anon_struct2
+// CIR-DAG: cir.global "private" internal dso_local @test_zero_init_padding.bfp = #cir.const_record<{
+// CIR-DAG-SAME: #cir.int<17> : !u8i,
+// CIR-DAG-SAME: #cir.const_array<[#cir.zero : !u8i, #cir.zero : !u8i, #cir.zero : !u8i]> : !cir.array<!u8i x 3>,
+// CIR-DAG-SAME: #cir.int<99> : !s32i
+// CIR-DAG-SAME: }> : !rec_anon_struct2
+
+// tp: int + char + 3 bytes tail padding -> uses !rec_anon_struct1
+// CIR-DAG: cir.global "private" internal dso_local @test_zero_init_padding.tp = #cir.const_record<{
+// CIR-DAG-SAME: #cir.int<10> : !s32i,
+// CIR-DAG-SAME: #cir.int<20> : !s8i,
+// CIR-DAG-SAME: #cir.const_array<[#cir.zero : !u8i, #cir.zero : !u8i, #cir.zero : !u8i]> : !cir.array<!u8i x 3>
+// CIR-DAG-SAME: }> : !rec_anon_struct1
+
+// mp: char + 1 byte padding + short + 4 bytes padding + long long -> uses !rec_anon_struct
+// CIR-DAG: cir.global "private" internal dso_local @test_zero_init_padding.mp = #cir.const_record<{
+// CIR-DAG-SAME: #cir.int<5> : !s8i,
+// CIR-DAG-SAME: #cir.zero : !u8i,
+// CIR-DAG-SAME: #cir.int<10> : !s16i,
+// CIR-DAG-SAME: #cir.const_array<[#cir.zero : !u8i, #cir.zero : !u8i, #cir.zero : !u8i, #cir.zero : !u8i]> : !cir.array<!u8i x 4>,
+// CIR-DAG-SAME: #cir.int<100> : !s64i
+// CIR-DAG-SAME: }> : !rec_anon_struct
+
+// CIR-LABEL: cir.func {{.*}}@test_zero_init_padding
+// CIR: cir.return
+
+// LLVM-DAG: @test_zero_init_padding.paf = internal global { i8, [3 x i8], i32 } { i8 1, [3 x i8] zeroinitializer, i32 42 }
+// LLVM-DAG: @test_zero_init_padding.bfp = internal global { i8, [3 x i8], i32 } { i8 17, [3 x i8] zeroinitializer, i32 99 }
+// LLVM-DAG: @test_zero_init_padding.tp = internal global { i32, i8, [3 x i8] } { i32 10, i8 20, [3 x i8] zeroinitializer }
+// LLVM-DAG: @test_zero_init_padding.mp = internal global { i8, i8, i16, [4 x i8], i64 } { i8 5, i8 0, i16 10, [4 x i8] zeroinitializer, i64 100 }
+
+// LLVM-LABEL: define{{.*}} void @test_zero_init_padding
+// LLVM: ret void
+
+// OGCG-DAG: @test_zero_init_padding.paf = internal constant { i8, [3 x i8], i32 } { i8 1, [3 x i8] zeroinitializer, i32 42 }
+// OGCG-DAG: @test_zero_init_padding.bfp = internal constant { i8, [3 x i8], i32 } { i8 17, [3 x i8] zeroinitializer, i32 99 }
+// OGCG-DAG: @test_zero_init_padding.tp = internal constant { i32, i8, [3 x i8] } { i32 10, i8 20, [3 x i8] zeroinitializer }
+// OGCG-DAG: @test_zero_init_padding.mp = internal constant { i8, i8, i16, [4 x i8], i64 } { i8 5, i8 0, i16 10, [4 x i8] zeroinitializer, i64 100 }
+
+// OGCG-LABEL: define{{.*}} void @test_zero_init_padding
+// OGCG: ret void
diff --git a/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl
index 89a66b0..96c2d95 100644
--- a/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl
+++ b/clang/test/CodeGenHLSL/resources/StructuredBuffers-constructors.hlsl
@@ -1,8 +1,7 @@
// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | \
// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
-// FIXME: SPIR-V codegen of llvm.spv.resource.handlefrombinding and resource types is not yet implemented
-// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | \
-// llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | \
+// RUN: llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-SPV
// NOTE: Itanium ABI for C++ requires Clang to generate 2 constructors types to support polymorphism:
// - C1 - Complete object constructor - constructs the complete object, including virtual base classes.
@@ -24,52 +23,65 @@ export void foo() {
// CHECK-DXIL: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
// CHECK-DXIL: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
-// CHECK: @Buf1 = internal global %"class.hlsl::StructuredBuffer" poison, align 4
+// CHECK: @Buf1 = internal global %"class.hlsl::StructuredBuffer" poison
// CHECK: @[[Buf1Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf1\00", align 1
-// CHECK: @Buf2 = internal global %"class.hlsl::RWStructuredBuffer" poison, align 4
+// CHECK: @Buf2 = internal global %"class.hlsl::RWStructuredBuffer" poison
// CHECK: @[[Buf2Str:.*]] = private unnamed_addr constant [5 x i8] c"Buf2\00", align 1
// Buf1 initialization part 1 - global init function that calls StructuredBuffer<float>::__createFromBinding
// with explicit binding
-// CHECK: define internal void @__cxx_global_var_init()
+// CHECK: define internal {{.*}}void @__cxx_global_var_init()
// CHECK-NEXT: entry:
-// CHECK-NEXT: call void @hlsl::StructuredBuffer<float>::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*)
+// CHECK: call void @hlsl::StructuredBuffer<float>::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*)
// CHECK-SAME: (ptr {{.*}} @Buf1, i32 noundef 10, i32 noundef 2, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]])
// Buf1 initialization part 2 - body of StructuredBuffer<float>::::__createFromBinding
// CHECK: define {{.*}} void @hlsl::StructuredBuffer<float>::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*)
-// CHECK-SAME: ptr {{.*}} sret(%"class.hlsl::StructuredBuffer") align 4 %[[RetValue1:.*]], i32 noundef %registerNo,
+// CHECK-SAME: ptr {{.*}} sret(%"class.hlsl::StructuredBuffer") align {{(4|8)}} %[[RetValue1:.*]], i32 noundef %registerNo,
// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
-// CHECK: %[[Tmp1:.*]] = alloca %"class.hlsl::StructuredBuffer", align 4
+// CHECK: %[[Tmp1:.*]] = alloca %"class.hlsl::StructuredBuffer"
// CHECK-DXIL: %[[Handle1:.*]] = call target("dx.RawBuffer", float, 0, 0)
// CHECK-DXIL-SAME: @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t(
// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::StructuredBuffer", ptr %[[Tmp1]], i32 0, i32 0
// CHECK-DXIL: store target("dx.RawBuffer", float, 0, 0) %[[Handle1]], ptr %__handle, align 4
// CHECK: call void @hlsl::StructuredBuffer<float>::StructuredBuffer(hlsl::StructuredBuffer<float> const&)(ptr {{.*}} %[[RetValue1]], ptr {{.*}} %[[Tmp1]])
-// Buf2 initialization part 1 - global init function that calls RWStructuredBuffer<float>::__createFromImplicitBinding
-// CHECK: define internal void @__cxx_global_var_init.1()
+// Buf2 initialization part 1 - global init function that calls RWStructuredBuffer<float>::__createFromImplicitBindingWithImplicitCounter
+// CHECK: define internal {{.*}}void @__cxx_global_var_init.1()
// CHECK-NEXT: entry:
-// CHECK-NEXT: call void @hlsl::RWStructuredBuffer<float>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
-// CHECK-SAME: (ptr {{.*}} @Buf2, i32 noundef 0, i32 noundef 0, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf2Str]])
+// CHECK: call void @hlsl::RWStructuredBuffer<float>::__createFromImplicitBindingWithImplicitCounter(unsigned int, unsigned int, int, unsigned int, char const*, unsigned int)
+// CHECK-SAME: (ptr {{.*}} @Buf2, i32 noundef 0, i32 noundef 0, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf2Str]], i32 noundef 1)
-// Buf2 initialization part 2 - body of RWStructuredBuffer<float>::__createFromImplicitBinding
-// CHECK: define linkonce_odr hidden void @hlsl::RWStructuredBuffer<float>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
-// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWStructuredBuffer") align 4 %[[RetValue2:.*]], i32 noundef %orderId,
-// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
-// CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::RWStructuredBuffer", align 4
+// Buf2 initialization part 2 - body of RWStructuredBuffer<float>::__createFromImplicitBindingWithImplicitCounter
+// CHECK: define linkonce_odr hidden void @hlsl::RWStructuredBuffer<float>::__createFromImplicitBindingWithImplicitCounter(unsigned int, unsigned int, int, unsigned int, char const*, unsigned int)
+// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWStructuredBuffer") align {{(4|8)}} %[[RetValue2:.*]], i32 noundef %orderId,
+// CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name, i32 noundef %counterOrderId)
+// CHECK: %[[Tmp2:.*]] = alloca %"class.hlsl::RWStructuredBuffer"
// CHECK-DXIL: %[[Handle2:.*]] = call target("dx.RawBuffer", float, 1, 0)
// CHECK-DXIL-SAME: @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_f32_1_0t(
-// CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %[[Tmp2]], i32 0, i32 0
-// CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) %[[Handle2]], ptr %__handle, align 4
+// CHECK-DXIL: %[[HandlePtr:.*]] = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %[[Tmp2]], i32 0, i32 0
+// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %[[Handle2]], ptr %[[HandlePtr]], align 4
+// CHECK-DXIL: %[[HandlePtr:.*]] = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %[[Tmp2]], i32 0, i32 0
+// CHECK-DXIL: %[[LoadedHandle:.*]] = load target("dx.RawBuffer", float, 1, 0), ptr %[[HandlePtr]], align 4
+// CHECK-DXIL: %[[CounterHandlePtr:.*]] = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %[[Tmp2]], i32 0, i32 1
+// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %[[LoadedHandle]], ptr %[[CounterHandlePtr]], align 4
+// CHECK-SPV: %[[Handle2:.*]] = call target("spirv.VulkanBuffer", [0 x float], 12, 1)
+// CHECK-SPV-SAME: @llvm.spv.resource.handlefromimplicitbinding.tspirv.VulkanBuffer_a0f32_12_1t(
+// CHECK-SPV: %[[HandlePtr:.*]] = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %[[Tmp2]], i32 0, i32 0
+// CHECK-SPV-NEXT: store target("spirv.VulkanBuffer", [0 x float], 12, 1) %[[Handle2]], ptr %[[HandlePtr]], align 8
+// CHECK-SPV: %[[HandlePtr:.*]] = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %[[Tmp2]], i32 0, i32 0
+// CHECK-SPV: %[[LoadedHandle:.*]] = load target("spirv.VulkanBuffer", [0 x float], 12, 1), ptr %[[HandlePtr]], align 8
+// CHECK-SPV: %[[CounterHandle:.*]] = call target("spirv.VulkanBuffer", i32, 12, 1) @llvm.spv.resource.counterhandlefromimplicitbinding
+// CHECK-SPV: %[[CounterHandlePtr:.*]] = getelementptr inbounds nuw %"class.hlsl::RWStructuredBuffer", ptr %[[Tmp2]], i32 0, i32 1
+// CHECK-SPV-NEXT: store target("spirv.VulkanBuffer", i32, 12, 1) %[[CounterHandle]], ptr %[[CounterHandlePtr]], align 8
// CHECK: call void @hlsl::RWStructuredBuffer<float>::RWStructuredBuffer(hlsl::RWStructuredBuffer<float> const&)(ptr {{.*}} %[[RetValue2]], ptr {{.*}} %[[Tmp2]])
// Buf3 initialization part 1 - local variable declared in function foo() is initialized by
// AppendStructuredBuffer<float> C1 default constructor
-// CHECK: define void @foo()
+// CHECK: define {{.*}}void @foo()
// CHECK-NEXT: entry:
-// CHECK-NEXT: %Buf3 = alloca %"class.hlsl::AppendStructuredBuffer", align 4
+// CHECK: %Buf3 = alloca %"class.hlsl::AppendStructuredBuffer", align {{4|8}}
// CHECK-NEXT: call void @hlsl::AppendStructuredBuffer<float>::AppendStructuredBuffer()(ptr {{.*}} %Buf3)
// Buf3 initialization part 2 - body of AppendStructuredBuffer<float> default C1 constructor that calls
@@ -84,7 +96,7 @@ export void foo() {
// CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) poison, ptr %__handle, align 4
// Module initialization
-// CHECK: define internal void @_GLOBAL__sub_I_StructuredBuffers_constructors.hlsl()
+// CHECK: define internal {{.*}}void @_GLOBAL__sub_I_StructuredBuffers_constructors.hlsl()
// CHECK-NEXT: entry:
-// CHECK-NEXT: call void @__cxx_global_var_init()
-// CHECK-NEXT: call void @__cxx_global_var_init.1()
+// CHECK: call {{.*}}void @__cxx_global_var_init()
+// CHECK-NEXT: call {{.*}}void @__cxx_global_var_init.1()
diff --git a/clang/test/CodeGenHLSL/resources/res-array-rw-counter.hlsl b/clang/test/CodeGenHLSL/resources/res-array-rw-counter.hlsl
new file mode 100644
index 0000000..c3d5784
--- /dev/null
+++ b/clang/test/CodeGenHLSL/resources/res-array-rw-counter.hlsl
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPV
+
+// CHECK-DXIL: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), target("dx.RawBuffer", float, 1, 0) }
+// CHECK-SPV: %"class.hlsl::RWStructuredBuffer" = type { target("spirv.VulkanBuffer", [0 x float], 12, 1), target("spirv.VulkanBuffer", i32, 12, 1) }
+
+RWStructuredBuffer<float> BufArray[4];
+
+export void foo(int idx) {
+ BufArray[0].IncrementCounter();
+ BufArray[idx].DecrementCounter();
+}
+
+// CHECK: @[[BufArrayStr:.*]] = private unnamed_addr constant [9 x i8] c"BufArray\00", align 1
+
+// CHECK: define {{.*}}void @_Z3fooi(i32 noundef %[[IDX_ARG:.*]])
+// CHECK-NEXT: entry:
+// CHECK: %[[IDX_ADDR:.*]] = alloca i32
+// CHECK: [[TMP_INC:%.*]] = alloca %"class.hlsl::RWStructuredBuffer"
+// CHECK: [[TMP_DEC:%.*]] = alloca %"class.hlsl::RWStructuredBuffer"
+// CHECK: store i32 %[[IDX_ARG]], ptr %[[IDX_ADDR]]
+// CHECK: call void @_ZN4hlsl18RWStructuredBufferIfE46__createFromImplicitBindingWithImplicitCounterEjjijPKcj(ptr {{.*}} [[TMP_INC]], i32 noundef 0, i32 noundef 0, i32 noundef 4, i32 noundef 0, ptr noundef @[[BufArrayStr]], i32 noundef 1)
+// CHECK: call noundef i32 @_ZN4hlsl18RWStructuredBufferIfE16IncrementCounterEv(ptr {{.*}} [[TMP_INC]])
+// CHECK: %[[IDX_LOADED:.*]] = load i32, ptr %[[IDX_ADDR]]
+// CHECK: call void @_ZN4hlsl18RWStructuredBufferIfE46__createFromImplicitBindingWithImplicitCounterEjjijPKcj(ptr {{.*}} [[TMP_DEC]], i32 noundef 0, i32 noundef 0, i32 noundef 4, i32 noundef %[[IDX_LOADED]], ptr noundef @[[BufArrayStr]], i32 noundef 1)
+// CHECK: call noundef i32 @_ZN4hlsl18RWStructuredBufferIfE16DecrementCounterEv(ptr {{.*}} [[TMP_DEC]]) \ No newline at end of file
diff --git a/clang/test/CodeGenHLSL/vk_binding_attr.hlsl b/clang/test/CodeGenHLSL/vk_binding_attr.hlsl
index bbef051..45955e1 100644
--- a/clang/test/CodeGenHLSL/vk_binding_attr.hlsl
+++ b/clang/test/CodeGenHLSL/vk_binding_attr.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-library -finclude-default-header -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -finclude-default-header -O3 -emit-llvm -o - %s | FileCheck %s
// CHECK: [[Buf:@.*]] = private unnamed_addr constant [4 x i8] c"Buf\00"
// CHECK: [[Buf2:@.*]] = private unnamed_addr constant [5 x i8] c"Buf2\00"
// CHECK: [[Buf3:@.*]] = private unnamed_addr constant [5 x i8] c"Buf3\00"
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 3b7c138..768af09 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1037,7 +1037,7 @@ void test() {
namespace GH66612 {
template<typename C>
- auto end(C c) ->int;
+ auto end(C c) ->int; // expected-note {{possible target for call}}
template <typename T>
concept Iterator = true;
@@ -1047,9 +1047,8 @@ namespace GH66612 {
{ end } -> Iterator; // #66612GH_END
};
- static_assert(Container<int>);// expected-error{{static assertion failed}}
- // expected-note@-1{{because 'int' does not satisfy 'Container'}}
- // expected-note@#66612GH_END{{because 'end' would be invalid: reference to overloaded function could not be resolved; did you mean to call it?}}
+ static_assert(Container<int>);
+ // expected-error@#66612GH_END{{reference to overloaded function could not be resolved; did you mean to call it?}}
}
namespace GH66938 {
@@ -1407,7 +1406,6 @@ static_assert(!std::is_constructible_v<span<4>, array<int, 3>>);
}
-
namespace GH162125 {
template<typename, int size>
concept true_int = (size, true);
@@ -1444,3 +1442,37 @@ struct s {
void(*test)(int) = &s<bool>::f<int>;
}
+namespace GH51246 {
+void f(); // expected-note {{possible target for call}}
+void f(int); // expected-note {{possible target for call}}
+void g();
+static_assert(requires { f; }); // expected-error {{reference to overloaded function could not be resolved}}
+static_assert(requires { g; });
+struct S {
+ void mf() {
+ static_assert(requires { mf(); });
+ static_assert(requires { mf; }); // expected-error {{reference to non-static member function must be called}}
+ static_assert(requires { S::mf; }); // expected-error {{reference to non-static member function must be called}}
+ }
+ void mf2(int); // expected-note 2{{possible target for call}}
+ void mf2() { // expected-note 2{{possible target for call}}
+ static_assert(requires { mf2; }); // expected-error {{reference to non-static member function must be called}}
+ static_assert(requires { S::mf2; }); // expected-error {{reference to non-static member function must be called}}
+ }
+};
+
+} // namespace GH51246
+
+
+namespace GH97753 {
+
+void f(); // expected-note {{possible target for call}}
+void f(int); // expected-note {{possible target for call}}
+
+template<typename T>
+concept C = sizeof(T) == 42;
+
+static_assert( requires {{ &f } -> C;} ); // expected-error {{reference to overloaded function could not be resolved;}}
+// expected-error@-1 {{static assertion failed due to requirement 'requires { { &f() } -> C; }'}}
+
+}
diff --git a/clang/unittests/ASTMatchers/CMakeLists.txt b/clang/unittests/ASTMatchers/CMakeLists.txt
index 47bd5c1..955566c 100644
--- a/clang/unittests/ASTMatchers/CMakeLists.txt
+++ b/clang/unittests/ASTMatchers/CMakeLists.txt
@@ -3,7 +3,6 @@ add_clang_unittest(ASTMatchersTests
ASTMatchersNodeTest.cpp
ASTMatchersNarrowingTest.cpp
ASTMatchersTraversalTest.cpp
- GtestMatchersTest.cpp
CLANG_LIBS
clangAST
clangASTMatchers
diff --git a/clang/unittests/ASTMatchers/GtestMatchersTest.cpp b/clang/unittests/ASTMatchers/GtestMatchersTest.cpp
deleted file mode 100644
index 5ee67a9..0000000
--- a/clang/unittests/ASTMatchers/GtestMatchersTest.cpp
+++ /dev/null
@@ -1,418 +0,0 @@
-//===- unittests/ASTMatchers/GTestMatchersTest.cpp - GTest matcher unit tests //
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ASTMatchersTest.h"
-#include "clang/ASTMatchers/ASTMatchers.h"
-#include "clang/ASTMatchers/GtestMatchers.h"
-
-namespace clang {
-namespace ast_matchers {
-
-constexpr llvm::StringLiteral GtestMockDecls = R"cc(
- static int testerr;
-
-#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- switch (0) \
- case 0: \
- default: // NOLINT
-
-#define GTEST_NONFATAL_FAILURE_(code) testerr = code
-
-#define GTEST_FATAL_FAILURE_(code) testerr = code
-
-#define GTEST_ASSERT_(expression, on_failure) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (const int gtest_ar = (expression)) \
- ; \
- else \
- on_failure(gtest_ar)
-
- // Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
- // Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
- GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
-
-#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
- GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
-#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
- GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
-
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
- GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
-
-#define EXPECT_PRED_FORMAT1(pred_format, v1) \
- GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT1(pred_format, v1) \
- GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-
-#define EXPECT_EQ(val1, val2) \
- EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
-#define EXPECT_NE(val1, val2) \
- EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
-#define EXPECT_GE(val1, val2) \
- EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
-#define EXPECT_GT(val1, val2) \
- EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
-#define EXPECT_LE(val1, val2) \
- EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
-#define EXPECT_LT(val1, val2) \
- EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
-
-#define ASSERT_THAT(value, matcher) \
- ASSERT_PRED_FORMAT1( \
- ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
-#define EXPECT_THAT(value, matcher) \
- EXPECT_PRED_FORMAT1( \
- ::testing::internal::MakePredicateFormatterFromMatcher(matcher), value)
-
-#define ASSERT_EQ(val1, val2) \
- ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
-#define ASSERT_NE(val1, val2) \
- ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
-
-#define GMOCK_ON_CALL_IMPL_(mock_expr, Setter, call) \
- ((mock_expr).gmock_##call)(::testing::internal::GetWithoutMatchers(), \
- nullptr) \
- .Setter(nullptr, 0, #mock_expr, #call)
-
-#define ON_CALL(obj, call) \
- GMOCK_ON_CALL_IMPL_(obj, InternalDefaultActionSetAt, call)
-
-#define EXPECT_CALL(obj, call) \
- GMOCK_ON_CALL_IMPL_(obj, InternalExpectedAt, call)
-
- namespace testing {
- namespace internal {
- class EqHelper {
- public:
- // This templatized version is for the general case.
- template <typename T1, typename T2>
- static int Compare(const char* lhs_expression, const char* rhs_expression,
- const T1& lhs, const T2& rhs) {
- return 0;
- }
- };
- template <typename T1, typename T2>
- int CmpHelperNE(const char* expr1, const char* expr2, const T1& val1,
- const T2& val2) {
- return 0;
- }
- template <typename T1, typename T2>
- int CmpHelperGE(const char* expr1, const char* expr2, const T1& val1,
- const T2& val2) {
- return 0;
- }
- template <typename T1, typename T2>
- int CmpHelperGT(const char* expr1, const char* expr2, const T1& val1,
- const T2& val2) {
- return 0;
- }
- template <typename T1, typename T2>
- int CmpHelperLE(const char* expr1, const char* expr2, const T1& val1,
- const T2& val2) {
- return 0;
- }
- template <typename T1, typename T2>
- int CmpHelperLT(const char* expr1, const char* expr2, const T1& val1,
- const T2& val2) {
- return 0;
- }
-
- // For implementing ASSERT_THAT() and EXPECT_THAT(). The template
- // argument M must be a type that can be converted to a matcher.
- template <typename M>
- class PredicateFormatterFromMatcher {
- public:
- explicit PredicateFormatterFromMatcher(M m) : matcher_(m) {}
-
- // This template () operator allows a PredicateFormatterFromMatcher
- // object to act as a predicate-formatter suitable for using with
- // Google Test's EXPECT_PRED_FORMAT1() macro.
- template <typename T>
- int operator()(const char* value_text, const T& x) const {
- return 0;
- }
-
- private:
- const M matcher_;
- };
-
- template <typename M>
- inline PredicateFormatterFromMatcher<M> MakePredicateFormatterFromMatcher(
- M matcher) {
- return PredicateFormatterFromMatcher<M>(matcher);
- }
-
- bool GetWithoutMatchers() { return false; }
-
- template <typename F>
- class MockSpec {
- public:
- MockSpec<F>() {}
-
- bool InternalDefaultActionSetAt(
- const char* file, int line, const char* obj, const char* call) {
- return false;
- }
-
- bool InternalExpectedAt(
- const char* file, int line, const char* obj, const char* call) {
- return false;
- }
-
- MockSpec<F> operator()(bool, void*) {
- return *this;
- }
- }; // class MockSpec
-
- } // namespace internal
-
- template <typename T>
- int StrEq(T val) {
- return 0;
- }
- template <typename T>
- int Eq(T val) {
- return 0;
- }
-
- } // namespace testing
-
- class Mock {
- public:
- Mock() {}
- testing::internal::MockSpec<int> gmock_TwoArgsMethod(int, int) {
- return testing::internal::MockSpec<int>();
- }
- testing::internal::MockSpec<int> gmock_TwoArgsMethod(bool, void*) {
- return testing::internal::MockSpec<int>();
- }
- }; // class Mock
-)cc";
-
-static std::string wrapGtest(llvm::StringRef Input) {
- return (GtestMockDecls + Input).str();
-}
-
-TEST(GtestAssertTest, ShouldMatchAssert) {
- std::string Input = R"cc(
- void Test() { ASSERT_EQ(1010, 4321); }
- )cc";
- EXPECT_TRUE(matches(wrapGtest(Input),
- gtestAssert(GtestCmp::Eq, integerLiteral(equals(1010)),
- integerLiteral(equals(4321)))));
-}
-
-TEST(GtestAssertTest, ShouldNotMatchExpect) {
- std::string Input = R"cc(
- void Test() { EXPECT_EQ(2, 3); }
- )cc";
- EXPECT_TRUE(
- notMatches(wrapGtest(Input), gtestAssert(GtestCmp::Eq, expr(), expr())));
-}
-
-TEST(GtestAssertTest, ShouldMatchNestedAssert) {
- std::string Input = R"cc(
- #define WRAPPER(a, b) ASSERT_EQ(a, b)
- void Test() { WRAPPER(2, 3); }
- )cc";
- EXPECT_TRUE(
- matches(wrapGtest(Input), gtestAssert(GtestCmp::Eq, expr(), expr())));
-}
-
-TEST(GtestExpectTest, ShouldMatchExpect) {
- std::string Input = R"cc(
- void Test() { EXPECT_EQ(1010, 4321); }
- )cc";
- EXPECT_TRUE(matches(wrapGtest(Input),
- gtestExpect(GtestCmp::Eq, integerLiteral(equals(1010)),
- integerLiteral(equals(4321)))));
-}
-
-TEST(GtestExpectTest, ShouldNotMatchAssert) {
- std::string Input = R"cc(
- void Test() { ASSERT_EQ(2, 3); }
- )cc";
- EXPECT_TRUE(
- notMatches(wrapGtest(Input), gtestExpect(GtestCmp::Eq, expr(), expr())));
-}
-
-TEST(GtestExpectTest, NeShouldMatchExpectNe) {
- std::string Input = R"cc(
- void Test() { EXPECT_NE(2, 3); }
- )cc";
- EXPECT_TRUE(
- matches(wrapGtest(Input), gtestExpect(GtestCmp::Ne, expr(), expr())));
-}
-
-TEST(GtestExpectTest, LeShouldMatchExpectLe) {
- std::string Input = R"cc(
- void Test() { EXPECT_LE(2, 3); }
- )cc";
- EXPECT_TRUE(
- matches(wrapGtest(Input), gtestExpect(GtestCmp::Le, expr(), expr())));
-}
-
-TEST(GtestExpectTest, LtShouldMatchExpectLt) {
- std::string Input = R"cc(
- void Test() { EXPECT_LT(2, 3); }
- )cc";
- EXPECT_TRUE(
- matches(wrapGtest(Input), gtestExpect(GtestCmp::Lt, expr(), expr())));
-}
-
-TEST(GtestExpectTest, GeShouldMatchExpectGe) {
- std::string Input = R"cc(
- void Test() { EXPECT_GE(2, 3); }
- )cc";
- EXPECT_TRUE(
- matches(wrapGtest(Input), gtestExpect(GtestCmp::Ge, expr(), expr())));
-}
-
-TEST(GtestExpectTest, GtShouldMatchExpectGt) {
- std::string Input = R"cc(
- void Test() { EXPECT_GT(2, 3); }
- )cc";
- EXPECT_TRUE(
- matches(wrapGtest(Input), gtestExpect(GtestCmp::Gt, expr(), expr())));
-}
-
-TEST(GtestExpectTest, ThatShouldMatchAssertThat) {
- std::string Input = R"cc(
- using ::testing::Eq;
- void Test() { ASSERT_THAT(2, Eq(2)); }
- )cc";
- EXPECT_TRUE(matches(
- wrapGtest(Input),
- gtestAssertThat(
- expr(), callExpr(callee(functionDecl(hasName("::testing::Eq")))))));
-}
-
-TEST(GtestExpectTest, ThatShouldMatchExpectThat) {
- std::string Input = R"cc(
- using ::testing::Eq;
- void Test() { EXPECT_THAT(2, Eq(2)); }
- )cc";
- EXPECT_TRUE(matches(
- wrapGtest(Input),
- gtestExpectThat(
- expr(), callExpr(callee(functionDecl(hasName("::testing::Eq")))))));
-}
-
-TEST(GtestOnCallTest, CallShouldMatchOnCallWithoutParams1) {
- std::string Input = R"cc(
- void Test() {
- Mock mock;
- ON_CALL(mock, TwoArgsMethod);
- }
- )cc";
- EXPECT_TRUE(matches(wrapGtest(Input),
- gtestOnCall(expr(hasType(cxxRecordDecl(hasName("Mock")))),
- "TwoArgsMethod", MockArgs::None)));
-}
-
-TEST(GtestOnCallTest, CallShouldMatchOnCallWithoutParams2) {
- std::string Input = R"cc(
- void Test() {
- Mock mock;
- ON_CALL(mock, TwoArgsMethod);
- }
- )cc";
- EXPECT_TRUE(matches(
- wrapGtest(Input),
- gtestOnCall(cxxMemberCallExpr(
- callee(functionDecl(hasName("gmock_TwoArgsMethod"))))
- .bind("mock_call"),
- MockArgs::None)));
-}
-
-TEST(GtestOnCallTest, CallShouldMatchOnCallWithParams1) {
- std::string Input = R"cc(
- void Test() {
- Mock mock;
- ON_CALL(mock, TwoArgsMethod(1, 2));
- }
- )cc";
- EXPECT_TRUE(matches(wrapGtest(Input),
- gtestOnCall(expr(hasType(cxxRecordDecl(hasName("Mock")))),
- "TwoArgsMethod", MockArgs::Some)));
-}
-
-TEST(GtestOnCallTest, CallShouldMatchOnCallWithParams2) {
- std::string Input = R"cc(
- void Test() {
- Mock mock;
- ON_CALL(mock, TwoArgsMethod(1, 2));
- }
- )cc";
- EXPECT_TRUE(matches(
- wrapGtest(Input),
- gtestOnCall(cxxMemberCallExpr(
- callee(functionDecl(hasName("gmock_TwoArgsMethod"))))
- .bind("mock_call"),
- MockArgs::Some)));
-}
-
-TEST(GtestExpectCallTest, CallShouldMatchExpectCallWithoutParams1) {
- std::string Input = R"cc(
- void Test() {
- Mock mock;
- EXPECT_CALL(mock, TwoArgsMethod);
- }
- )cc";
- EXPECT_TRUE(
- matches(wrapGtest(Input),
- gtestExpectCall(expr(hasType(cxxRecordDecl(hasName("Mock")))),
- "TwoArgsMethod", MockArgs::None)));
-}
-
-TEST(GtestExpectCallTest, CallShouldMatchExpectCallWithoutParams2) {
- std::string Input = R"cc(
- void Test() {
- Mock mock;
- EXPECT_CALL(mock, TwoArgsMethod);
- }
- )cc";
- EXPECT_TRUE(matches(
- wrapGtest(Input),
- gtestExpectCall(cxxMemberCallExpr(
- callee(functionDecl(hasName("gmock_TwoArgsMethod"))))
- .bind("mock_call"),
- MockArgs::None)));
-}
-
-TEST(GtestExpectCallTest, CallShouldMatchExpectCallWithParams1) {
- std::string Input = R"cc(
- void Test() {
- Mock mock;
- EXPECT_CALL(mock, TwoArgsMethod(1, 2));
- }
- )cc";
- EXPECT_TRUE(
- matches(wrapGtest(Input),
- gtestExpectCall(expr(hasType(cxxRecordDecl(hasName("Mock")))),
- "TwoArgsMethod", MockArgs::Some)));
-}
-
-TEST(GtestExpectCallTest, CallShouldMatchExpectCallWithParams2) {
- std::string Input = R"cc(
- void Test() {
- Mock mock;
- EXPECT_CALL(mock, TwoArgsMethod(1, 2));
- }
- )cc";
- EXPECT_TRUE(matches(
- wrapGtest(Input),
- gtestExpectCall(cxxMemberCallExpr(
- callee(functionDecl(hasName("gmock_TwoArgsMethod"))))
- .bind("mock_call"),
- MockArgs::Some)));
-}
-
-} // end namespace ast_matchers
-} // end namespace clang
diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h
index 4622dbc..69f1f5b 100644
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@@ -122,6 +122,11 @@ void genOpenACCTerminator(fir::FirOpBuilder &, mlir::Operation *,
/// clause.
uint64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
+/// Parse collapse clause and return {size, force}. If absent, returns
+/// {1,false}.
+std::pair<uint64_t, bool>
+getCollapseSizeAndForce(const Fortran::parser::AccClauseList &);
+
/// Checks whether the current insertion point is inside OpenACC loop.
bool isInOpenACCLoop(fir::FirOpBuilder &);
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 1443e93..d919b77 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -3517,7 +3517,9 @@ struct OmpObject {
std::variant<Designator, /*common block*/ Name, Invalid> u;
};
-WRAPPER_CLASS(OmpObjectList, std::list<OmpObject>);
+struct OmpObjectList {
+ WRAPPER_CLASS_BOILERPLATE(OmpObjectList, std::list<OmpObject>);
+};
// Ref: [4.5:201-207], [5.0:293-299], [5.1:325-331], [5.2:124]
//
@@ -3547,14 +3549,18 @@ struct OmpTypeSpecifier {
std::variant<TypeSpec, DeclarationTypeSpec> u;
};
-WRAPPER_CLASS(OmpTypeNameList, std::list<OmpTypeSpecifier>);
+struct OmpTypeNameList {
+ WRAPPER_CLASS_BOILERPLATE(OmpTypeNameList, std::list<OmpTypeSpecifier>);
+};
struct OmpLocator {
UNION_CLASS_BOILERPLATE(OmpLocator);
std::variant<OmpObject, FunctionReference> u;
};
-WRAPPER_CLASS(OmpLocatorList, std::list<OmpLocator>);
+struct OmpLocatorList {
+ WRAPPER_CLASS_BOILERPLATE(OmpLocatorList, std::list<OmpLocator>);
+};
// Ref: [4.5:58-60], [5.0:58-60], [5.1:63-68], [5.2:197-198], [6.0:334-336]
//
@@ -4324,7 +4330,9 @@ struct OmpIteration {
//
// iteration-vector ->
// [iteration...] // since 4.5
-WRAPPER_CLASS(OmpIterationVector, std::list<OmpIteration>);
+struct OmpIterationVector {
+ WRAPPER_CLASS_BOILERPLATE(OmpIterationVector, std::list<OmpIteration>);
+};
// Extract this into a separate structure (instead of having it directly in
// OmpDoacrossClause), so that the context in TYPE_CONTEXT_PARSER can be set
@@ -4364,14 +4372,18 @@ struct OmpDependClause {
//
// doacross-clause ->
// DOACROSS(dependence-type: iteration-vector) // since 5.2
-WRAPPER_CLASS(OmpDoacrossClause, OmpDoacross);
+struct OmpDoacrossClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpDoacrossClause, OmpDoacross);
+};
// Ref: [5.0:254-255], [5.1:287-288], [5.2:73]
//
// destroy-clause ->
// DESTROY | // since 5.0, until 5.1
// DESTROY(variable) // since 5.2
-WRAPPER_CLASS(OmpDestroyClause, OmpObject);
+struct OmpDestroyClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpDestroyClause, OmpObject);
+};
// Ref: [5.0:135-140], [5.1:161-166], [5.2:265-266]
//
@@ -4785,7 +4797,9 @@ struct OmpInitClause {
// REF: [5.1:217-220], [5.2:294]
//
// 14.1.3 use-clause -> USE (interop-var)
-WRAPPER_CLASS(OmpUseClause, OmpObject);
+struct OmpUseClause {
+ WRAPPER_CLASS_BOILERPLATE(OmpUseClause, OmpObject);
+};
// OpenMP Clauses
struct OmpClause {
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 975423b..98e2a5f 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -1126,6 +1126,9 @@ inline const DeclTypeSpec *Symbol::GetTypeImpl(int depth) const {
[&](const HostAssocDetails &x) {
return x.symbol().GetTypeImpl(depth);
},
+ [&](const GenericDetails &x) {
+ return x.specific() ? x.specific()->GetTypeImpl(depth) : nullptr;
+ },
[](const auto &) -> const DeclTypeSpec * { return nullptr; },
},
details_);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 3a022e1..50a687c 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3192,15 +3192,20 @@ private:
std::get_if<Fortran::parser::OpenACCCombinedConstruct>(&acc.u);
Fortran::lower::pft::Evaluation *curEval = &getEval();
+ // Determine collapse depth/force and loopCount
+ bool collapseForce = false;
+ uint64_t collapseDepth = 1;
+ uint64_t loopCount = 1;
if (accLoop || accCombined) {
- uint64_t loopCount;
if (accLoop) {
const Fortran::parser::AccBeginLoopDirective &beginLoopDir =
std::get<Fortran::parser::AccBeginLoopDirective>(accLoop->t);
const Fortran::parser::AccClauseList &clauseList =
std::get<Fortran::parser::AccClauseList>(beginLoopDir.t);
loopCount = Fortran::lower::getLoopCountForCollapseAndTile(clauseList);
+ std::tie(collapseDepth, collapseForce) =
+ Fortran::lower::getCollapseSizeAndForce(clauseList);
} else if (accCombined) {
const Fortran::parser::AccBeginCombinedDirective &beginCombinedDir =
std::get<Fortran::parser::AccBeginCombinedDirective>(
@@ -3208,6 +3213,8 @@ private:
const Fortran::parser::AccClauseList &clauseList =
std::get<Fortran::parser::AccClauseList>(beginCombinedDir.t);
loopCount = Fortran::lower::getLoopCountForCollapseAndTile(clauseList);
+ std::tie(collapseDepth, collapseForce) =
+ Fortran::lower::getCollapseSizeAndForce(clauseList);
}
if (curEval->lowerAsStructured()) {
@@ -3217,8 +3224,63 @@ private:
}
}
- for (Fortran::lower::pft::Evaluation &e : curEval->getNestedEvaluations())
- genFIR(e);
+ const bool isStructured = curEval && curEval->lowerAsStructured();
+ if (isStructured && collapseForce && collapseDepth > 1) {
+ // force: collect prologue/epilogue for the first collapseDepth nested
+ // loops and sink them into the innermost loop body at that depth
+ llvm::SmallVector<Fortran::lower::pft::Evaluation *> prologue, epilogue;
+ Fortran::lower::pft::Evaluation *parent = &getEval();
+ Fortran::lower::pft::Evaluation *innermostLoopEval = nullptr;
+ for (uint64_t lvl = 0; lvl + 1 < collapseDepth; ++lvl) {
+ epilogue.clear();
+ auto &kids = parent->getNestedEvaluations();
+ // Collect all non-loop statements before the next inner loop as
+ // prologue, then mark remaining siblings as epilogue and descend into
+ // the inner loop.
+ Fortran::lower::pft::Evaluation *childLoop = nullptr;
+ for (auto it = kids.begin(); it != kids.end(); ++it) {
+ if (it->getIf<Fortran::parser::DoConstruct>()) {
+ childLoop = &*it;
+ for (auto it2 = std::next(it); it2 != kids.end(); ++it2)
+ epilogue.push_back(&*it2);
+ break;
+ }
+ prologue.push_back(&*it);
+ }
+ // Semantics guarantees collapseDepth does not exceed nest depth
+ // so childLoop must be found here.
+ assert(childLoop && "Expected inner DoConstruct for collapse");
+ parent = childLoop;
+ innermostLoopEval = childLoop;
+ }
+
+ // Track sunk evaluations (avoid double-lowering)
+ llvm::SmallPtrSet<const Fortran::lower::pft::Evaluation *, 16> sunk;
+ for (auto *e : prologue)
+ sunk.insert(e);
+ for (auto *e : epilogue)
+ sunk.insert(e);
+
+ auto sink =
+ [&](llvm::SmallVector<Fortran::lower::pft::Evaluation *> &lst) {
+ for (auto *e : lst)
+ genFIR(*e);
+ };
+
+ sink(prologue);
+
+ // Lower innermost loop body, skipping sunk
+ for (Fortran::lower::pft::Evaluation &e :
+ innermostLoopEval->getNestedEvaluations())
+ if (!sunk.contains(&e))
+ genFIR(e);
+
+ sink(epilogue);
+ } else {
+ // Normal lowering
+ for (Fortran::lower::pft::Evaluation &e : curEval->getNestedEvaluations())
+ genFIR(e);
+ }
localSymbols.popScope();
builder->restoreInsertionPoint(insertPt);
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 742f58f..62e5c0c 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -2178,11 +2178,25 @@ static void processDoLoopBounds(
locs.push_back(converter.genLocation(
Fortran::parser::FindSourceLocation(outerDoConstruct)));
} else {
- auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
- assert(doCons && "expect do construct");
- loopControl = &*doCons->GetLoopControl();
+ // Safely locate the next inner DoConstruct within this eval.
+ const Fortran::parser::DoConstruct *innerDo = nullptr;
+ if (crtEval && crtEval->hasNestedEvaluations()) {
+ for (Fortran::lower::pft::Evaluation &child :
+ crtEval->getNestedEvaluations()) {
+ if (auto *stmt = child.getIf<Fortran::parser::DoConstruct>()) {
+ innerDo = stmt;
+ // Prepare to descend for the next iteration
+ crtEval = &child;
+ break;
+ }
+ }
+ }
+ if (!innerDo)
+ break; // No deeper loop; stop collecting collapsed bounds.
+
+ loopControl = &*innerDo->GetLoopControl();
locs.push_back(converter.genLocation(
- Fortran::parser::FindSourceLocation(*doCons)));
+ Fortran::parser::FindSourceLocation(*innerDo)));
}
const Fortran::parser::LoopControl::Bounds *bounds =
@@ -2206,8 +2220,7 @@ static void processDoLoopBounds(
inclusiveBounds.push_back(true);
- if (i < loopsToProcess - 1)
- crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
+ // crtEval already updated when descending; no blind increment here.
}
}
}
@@ -2553,10 +2566,6 @@ static mlir::acc::LoopOp createLoopOp(
std::get_if<Fortran::parser::AccClause::Collapse>(
&clause.u)) {
const Fortran::parser::AccCollapseArg &arg = collapseClause->v;
- const auto &force = std::get<bool>(arg.t);
- if (force)
- TODO(clauseLocation, "OpenACC collapse force modifier");
-
const auto &intExpr =
std::get<Fortran::parser::ScalarIntConstantExpr>(arg.t);
const auto *expr = Fortran::semantics::GetExpr(intExpr);
@@ -5029,25 +5038,34 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder,
uint64_t Fortran::lower::getLoopCountForCollapseAndTile(
const Fortran::parser::AccClauseList &clauseList) {
- uint64_t collapseLoopCount = 1;
+ uint64_t collapseLoopCount = getCollapseSizeAndForce(clauseList).first;
uint64_t tileLoopCount = 1;
for (const Fortran::parser::AccClause &clause : clauseList.v) {
- if (const auto *collapseClause =
- std::get_if<Fortran::parser::AccClause::Collapse>(&clause.u)) {
- const parser::AccCollapseArg &arg = collapseClause->v;
- const auto &collapseValue{std::get<parser::ScalarIntConstantExpr>(arg.t)};
- collapseLoopCount = *Fortran::semantics::GetIntValue(collapseValue);
- }
if (const auto *tileClause =
std::get_if<Fortran::parser::AccClause::Tile>(&clause.u)) {
const parser::AccTileExprList &tileExprList = tileClause->v;
- const std::list<parser::AccTileExpr> &listTileExpr = tileExprList.v;
- tileLoopCount = listTileExpr.size();
+ tileLoopCount = tileExprList.v.size();
+ }
+ }
+ return tileLoopCount > collapseLoopCount ? tileLoopCount : collapseLoopCount;
+}
+
+std::pair<uint64_t, bool> Fortran::lower::getCollapseSizeAndForce(
+ const Fortran::parser::AccClauseList &clauseList) {
+ uint64_t size = 1;
+ bool force = false;
+ for (const Fortran::parser::AccClause &clause : clauseList.v) {
+ if (const auto *collapseClause =
+ std::get_if<Fortran::parser::AccClause::Collapse>(&clause.u)) {
+ const Fortran::parser::AccCollapseArg &arg = collapseClause->v;
+ force = std::get<bool>(arg.t);
+ const auto &collapseValue =
+ std::get<Fortran::parser::ScalarIntConstantExpr>(arg.t);
+ size = *Fortran::semantics::GetIntValue(collapseValue);
+ break;
}
}
- if (tileLoopCount > collapseLoopCount)
- return tileLoopCount;
- return collapseLoopCount;
+ return {size, force};
}
/// Create an ACC loop operation for a DO construct when inside ACC compute
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 9e56c2b..bd94651 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -153,6 +153,7 @@ public:
clauseOps.loopLowerBounds = ops.loopLowerBounds;
clauseOps.loopUpperBounds = ops.loopUpperBounds;
clauseOps.loopSteps = ops.loopSteps;
+ clauseOps.collapseNumLoops = ops.collapseNumLoops;
ivOut.append(iv);
return true;
}
diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
index e595e61..260e525 100644
--- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
+++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
@@ -77,6 +77,10 @@ class MapInfoFinalizationPass
/// | |
std::map<mlir::Operation *, mlir::Value> localBoxAllocas;
+ // List of deferrable descriptors to process at the end of
+ // the pass.
+ llvm::SmallVector<mlir::Operation *> deferrableDesc;
+
/// Return true if the given path exists in a list of paths.
static bool
containsPath(const llvm::SmallVectorImpl<llvm::SmallVector<int64_t>> &paths,
@@ -183,6 +187,40 @@ class MapInfoFinalizationPass
newMemberIndexPaths.emplace_back(indexPath.begin(), indexPath.end());
}
+ // Check if the declaration operation we have refers to a dummy
+ // function argument.
+ bool isDummyArgument(mlir::Value mappedValue) {
+ if (auto declareOp = mlir::dyn_cast_if_present<hlfir::DeclareOp>(
+ mappedValue.getDefiningOp()))
+ if (auto dummyScope = declareOp.getDummyScope())
+ return true;
+ return false;
+ }
+
+ // Relevant for OpenMP < 5.2, where attach semantics and rules don't exist.
+ // As descriptors were an unspoken implementation detail in these versions
+ // there's certain cases where the user (and the compiler implementation)
+ // can create data mapping errors by having temporary descriptors stuck
+ // in memory. The main example is calling an 'target enter data map'
+ // without a corresponding exit on an assumed shape or size dummy
+ // argument, a local stack descriptor is generated, gets mapped and
+ // is then left on device. A user doesn't realize what they've done as
+ // the OpenMP specification isn't explicit on descriptor handling in
+ // earlier versions and as far as Fortran is concerned this is something
+ // hidden from a user. To avoid this we can defer the descriptor mapping
+ // in these cases until target or target data regions, when we can be
+ // sure they have a clear limited scope on device.
+ bool canDeferDescriptorMapping(mlir::Value descriptor) {
+ if (fir::isAllocatableType(descriptor.getType()) ||
+ fir::isPointerType(descriptor.getType()))
+ return false;
+ if (isDummyArgument(descriptor) &&
+ (fir::isAssumedType(descriptor.getType()) ||
+ fir::isAssumedShape(descriptor.getType())))
+ return true;
+ return false;
+ }
+
/// getMemberUserList gathers all users of a particular MapInfoOp that are
/// other MapInfoOp's and places them into the mapMemberUsers list, which
/// records the map that the current argument MapInfoOp "op" is part of
@@ -234,13 +272,16 @@ class MapInfoFinalizationPass
/// fir::BoxOffsetOp we utilise to access the descriptor datas
/// base address can be utilised.
mlir::Value getDescriptorFromBoxMap(mlir::omp::MapInfoOp boxMap,
- fir::FirOpBuilder &builder) {
+ fir::FirOpBuilder &builder,
+ bool &canDescBeDeferred) {
mlir::Value descriptor = boxMap.getVarPtr();
if (!fir::isTypeWithDescriptor(boxMap.getVarType()))
if (auto addrOp = mlir::dyn_cast_if_present<fir::BoxAddrOp>(
boxMap.getVarPtr().getDefiningOp()))
descriptor = addrOp.getVal();
+ canDescBeDeferred = canDeferDescriptorMapping(descriptor);
+
if (!mlir::isa<fir::BaseBoxType>(descriptor.getType()) &&
!fir::factory::isOptionalArgument(descriptor.getDefiningOp()))
return descriptor;
@@ -391,8 +432,7 @@ class MapInfoFinalizationPass
/// Check if the mapOp is present in the HasDeviceAddr clause on
/// the userOp. Only applies to TargetOp.
- bool isHasDeviceAddr(mlir::omp::MapInfoOp mapOp, mlir::Operation *userOp) {
- assert(userOp && "Expecting non-null argument");
+ bool isHasDeviceAddr(mlir::omp::MapInfoOp mapOp, mlir::Operation &userOp) {
if (auto targetOp = llvm::dyn_cast<mlir::omp::TargetOp>(userOp)) {
for (mlir::Value hda : targetOp.getHasDeviceAddrVars()) {
if (hda.getDefiningOp() == mapOp)
@@ -402,6 +442,26 @@ class MapInfoFinalizationPass
return false;
}
+ bool isUseDeviceAddr(mlir::omp::MapInfoOp mapOp, mlir::Operation &userOp) {
+ if (auto targetDataOp = llvm::dyn_cast<mlir::omp::TargetDataOp>(userOp)) {
+ for (mlir::Value uda : targetDataOp.getUseDeviceAddrVars()) {
+ if (uda.getDefiningOp() == mapOp)
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool isUseDevicePtr(mlir::omp::MapInfoOp mapOp, mlir::Operation &userOp) {
+ if (auto targetDataOp = llvm::dyn_cast<mlir::omp::TargetDataOp>(userOp)) {
+ for (mlir::Value udp : targetDataOp.getUseDevicePtrVars()) {
+ if (udp.getDefiningOp() == mapOp)
+ return true;
+ }
+ }
+ return false;
+ }
+
mlir::omp::MapInfoOp genBoxcharMemberMap(mlir::omp::MapInfoOp op,
fir::FirOpBuilder &builder) {
if (!op.getMembers().empty())
@@ -466,12 +526,14 @@ class MapInfoFinalizationPass
// TODO: map the addendum segment of the descriptor, similarly to the
// base address/data pointer member.
- mlir::Value descriptor = getDescriptorFromBoxMap(op, builder);
+ bool descCanBeDeferred = false;
+ mlir::Value descriptor =
+ getDescriptorFromBoxMap(op, builder, descCanBeDeferred);
mlir::ArrayAttr newMembersAttr;
mlir::SmallVector<mlir::Value> newMembers;
llvm::SmallVector<llvm::SmallVector<int64_t>> memberIndices;
- bool isHasDeviceAddrFlag = isHasDeviceAddr(op, target);
+ bool isHasDeviceAddrFlag = isHasDeviceAddr(op, *target);
if (!mapMemberUsers.empty() || !op.getMembers().empty())
getMemberIndicesAsVectors(
@@ -553,6 +615,10 @@ class MapInfoFinalizationPass
/*partial_map=*/builder.getBoolAttr(false));
op.replaceAllUsesWith(newDescParentMapOp.getResult());
op->erase();
+
+ if (descCanBeDeferred)
+ deferrableDesc.push_back(newDescParentMapOp);
+
return newDescParentMapOp;
}
@@ -701,6 +767,124 @@ class MapInfoFinalizationPass
return nullptr;
}
+ void addImplicitDescriptorMapToTargetDataOp(mlir::omp::MapInfoOp op,
+ fir::FirOpBuilder &builder,
+ mlir::Operation &target) {
+ // Checks if the map is present as an explicit map already on the target
+ // data directive, and not just present on a use_device_addr/ptr, as if
+ // that's the case, we should not need to add an implicit map for the
+ // descriptor.
+ auto explicitMappingPresent = [](mlir::omp::MapInfoOp op,
+ mlir::omp::TargetDataOp tarData) {
+ // Verify top-level descriptor mapping is at least equal with same
+ // varPtr, the map type should always be To for a descriptor, which is
+ // all we really care about for this mapping as we aim to make sure the
+ // descriptor is always present on device if we're expecting to access
+ // the underlying data.
+ if (tarData.getMapVars().empty())
+ return false;
+
+ for (mlir::Value mapVar : tarData.getMapVars()) {
+ auto mapOp = llvm::cast<mlir::omp::MapInfoOp>(mapVar.getDefiningOp());
+ if (mapOp.getVarPtr() == op.getVarPtr() &&
+ mapOp.getVarPtrPtr() == op.getVarPtrPtr()) {
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ // if we're not a top level descriptor with members (e.g. member of a
+ // derived type), we do not want to perform this step.
+ if (!llvm::isa<mlir::omp::TargetDataOp>(target) || op.getMembers().empty())
+ return;
+
+ if (!isUseDeviceAddr(op, target) && !isUseDevicePtr(op, target))
+ return;
+
+ auto targetDataOp = llvm::cast<mlir::omp::TargetDataOp>(target);
+ if (explicitMappingPresent(op, targetDataOp))
+ return;
+
+ mlir::omp::MapInfoOp newDescParentMapOp =
+ builder.create<mlir::omp::MapInfoOp>(
+ op->getLoc(), op.getResult().getType(), op.getVarPtr(),
+ op.getVarTypeAttr(),
+ builder.getIntegerAttr(
+ builder.getIntegerType(64, false),
+ llvm::to_underlying(
+ llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO |
+ llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_ALWAYS)),
+ op.getMapCaptureTypeAttr(), /*varPtrPtr=*/mlir::Value{},
+ mlir::SmallVector<mlir::Value>{}, mlir::ArrayAttr{},
+ /*bounds=*/mlir::SmallVector<mlir::Value>{},
+ /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
+ /*partial_map=*/builder.getBoolAttr(false));
+
+ targetDataOp.getMapVarsMutable().append({newDescParentMapOp});
+ }
+
+ void removeTopLevelDescriptor(mlir::omp::MapInfoOp op,
+ fir::FirOpBuilder &builder,
+ mlir::Operation *target) {
+ if (llvm::isa<mlir::omp::TargetOp, mlir::omp::TargetDataOp,
+ mlir::omp::DeclareMapperInfoOp>(target))
+ return;
+
+ // if we're not a top level descriptor with members (e.g. member of a
+ // derived type), we do not want to perform this step.
+ if (op.getMembers().empty())
+ return;
+
+ mlir::SmallVector<mlir::Value> members = op.getMembers();
+ mlir::omp::MapInfoOp baseAddr =
+ mlir::dyn_cast_or_null<mlir::omp::MapInfoOp>(
+ members.front().getDefiningOp());
+ assert(baseAddr && "Expected member to be MapInfoOp");
+ members.erase(members.begin());
+
+ llvm::SmallVector<llvm::SmallVector<int64_t>> memberIndices;
+ getMemberIndicesAsVectors(op, memberIndices);
+
+ // Can skip the extra processing if there's only 1 member as it'd
+ // be the base addresses, which we're promoting to the parent.
+ mlir::ArrayAttr membersAttr;
+ if (memberIndices.size() > 1) {
+ memberIndices.erase(memberIndices.begin());
+ membersAttr = builder.create2DI64ArrayAttr(memberIndices);
+ }
+
+ // VarPtrPtr is tied to detecting if something is a pointer in the later
+ // lowering currently, this at the moment comes tied with
+ // OMP_MAP_PTR_AND_OBJ being applied which breaks the problem this tries to
+ // solve by emitting a 8-byte mapping tied to the descriptor address (even
+ // if we only emit a single map). So we circumvent this by removing the
+ // varPtrPtr mapping, however, a side effect of this is we lose the
+ // additional load from the backend tied to this which is required for
+ // correctness and getting the correct address of the data to perform our
+ // mapping. So we do our load at this stage.
+ // TODO/FIXME: Tidy up the OMP_MAP_PTR_AND_OBJ and varPtrPtr being tied to
+ // if something is a pointer to try and tidy up the implementation a bit.
+ // This is an unfortunate complexity from push-back from upstream. We
+ // could also emit a load at this level for all base addresses as well,
+ // which in turn will simplify the later lowering a bit as well. But first
+ // need to see how well this alteration works.
+ auto loadBaseAddr =
+ builder.loadIfRef(op->getLoc(), baseAddr.getVarPtrPtr());
+ mlir::omp::MapInfoOp newBaseAddrMapOp =
+ builder.create<mlir::omp::MapInfoOp>(
+ op->getLoc(), loadBaseAddr.getType(), loadBaseAddr,
+ baseAddr.getVarTypeAttr(), baseAddr.getMapTypeAttr(),
+ baseAddr.getMapCaptureTypeAttr(), mlir::Value{}, members,
+ membersAttr, baseAddr.getBounds(),
+ /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getNameAttr(),
+ /*partial_map=*/builder.getBoolAttr(false));
+ op.replaceAllUsesWith(newBaseAddrMapOp.getResult());
+ op->erase();
+ baseAddr.erase();
+ }
+
// This pass executes on omp::MapInfoOp's containing descriptor based types
// (allocatables, pointers, assumed shape etc.) and expanding them into
// multiple omp::MapInfoOp's for each pointer member contained within the
@@ -730,6 +914,7 @@ class MapInfoFinalizationPass
// clear all local allocations we made for any boxes in any prior
// iterations from previous function scopes.
localBoxAllocas.clear();
+ deferrableDesc.clear();
// First, walk `omp.map.info` ops to see if any of them have varPtrs
// with an underlying type of fir.char<k, ?>, i.e a character
@@ -1010,6 +1195,36 @@ class MapInfoFinalizationPass
}
});
+ // Now that we've expanded all of our boxes into a descriptor and base
+ // address map where necessary, we check if the map owner is an
+ // enter/exit/target data directive, and if they are we drop the initial
+ // descriptor (top-level parent) and replace it with the
+ // base_address/data.
+ //
+ // This circumvents issues with stack allocated descriptors bound to
+ // device colliding which in Flang is rather trivial for a user to do by
+ // accident due to the rather pervasive local intermediate descriptor
+ // generation that occurs whenever you pass boxes around different scopes.
+ // In OpenMP 6+ mapping these would be a user error as the tools required
+ // to circumvent these issues are provided by the spec (ref_ptr/ptee map
+ // types), but in prior specifications these tools are not available and
+ // it becomes an implementation issue for us to solve.
+ //
+ // We do this by dropping the top-level descriptor which will be the stack
+ // descriptor when we perform enter/exit maps, as we don't want these to
+ // be bound until necessary which is when we utilise the descriptor type
+ // within a target region. At which point we map the relevant descriptor
+ // data and the runtime should correctly associate the data with the
+ // descriptor and bind together and allow clean mapping and execution.
+ for (auto *op : deferrableDesc) {
+ auto mapOp = llvm::dyn_cast<mlir::omp::MapInfoOp>(op);
+ mlir::Operation *targetUser = getFirstTargetUser(mapOp);
+ assert(targetUser && "expected user of map operation was not found");
+ builder.setInsertionPoint(mapOp);
+ removeTopLevelDescriptor(mapOp, builder, targetUser);
+ addImplicitDescriptorMapToTargetDataOp(mapOp, builder, *targetUser);
+ }
+
// Wait until after we have generated all of our maps to add them onto
// the target's block arguments, simplifying the process as there would be
// no need to avoid accidental duplicate additions.
diff --git a/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90 b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
new file mode 100644
index 0000000..ca932c1
--- /dev/null
+++ b/flang/test/Lower/OpenACC/acc-loop-collapse-force-lowering.f90
@@ -0,0 +1,41 @@
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+! Verify collapse(force:2) sinks prologue (between loops) and epilogue (after inner loop)
+! into the acc.loop region body.
+
+subroutine collapse_force_sink(n, m)
+ integer, intent(in) :: n, m
+ real, dimension(n,m) :: a
+ real, dimension(n) :: bb, cc
+ integer :: i, j
+
+ !$acc parallel loop collapse(force:2)
+ do i = 1, n
+ bb(i) = 4.2 ! prologue (between loops)
+ do j = 1, m
+ a(i,j) = a(i,j) + 2.0
+ end do
+ cc(i) = 7.3 ! epilogue (after inner loop)
+ end do
+ !$acc end parallel loop
+end subroutine
+
+! CHECK: func.func @_QPcollapse_force_sink(
+! CHECK: acc.parallel
+! Ensure outer acc.loop is combined(parallel)
+! CHECK: acc.loop combined(parallel)
+! Prologue: constant 4.2 and an assign before inner loop
+! CHECK: arith.constant 4.200000e+00
+! CHECK: hlfir.assign
+! Inner loop and its body include 2.0 add and an assign
+! CHECK: acc.loop
+! CHECK: arith.constant 2.000000e+00
+! CHECK: arith.addf
+! CHECK: hlfir.assign
+! Epilogue: constant 7.3 and an assign after inner loop
+! CHECK: arith.constant 7.300000e+00
+! CHECK: hlfir.assign
+! And the outer acc.loop has collapse = [2]
+! CHECK: } attributes {collapse = [2]
+
+
diff --git a/flang/test/Lower/OpenMP/distribute-parallel-do-simd.f90 b/flang/test/Lower/OpenMP/distribute-parallel-do-simd.f90
index 142bc02a..c769152 100644
--- a/flang/test/Lower/OpenMP/distribute-parallel-do-simd.f90
+++ b/flang/test/Lower/OpenMP/distribute-parallel-do-simd.f90
@@ -112,7 +112,7 @@ integer :: i,j
! CHECK: omp.distribute {
! CHECK: omp.wsloop {
! CHECK: omp.simd private({{.*}}) {
-! CHECK: omp.loop_nest (%[[I_IV:.*]], %[[J_IV:.*]]) : i32 = ({{.*}}) to ({{.*}}) inclusive step ({{.*}}) {
+! CHECK: omp.loop_nest (%[[I_IV:.*]], %[[J_IV:.*]]) : i32 = ({{.*}}) to ({{.*}}) inclusive step ({{.*}}) collapse(2) {
! CHECK: %[[Y_MAX_PRIV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}y_max"}
! CHECK: %[[I_UB:.*]] = fir.load %[[X_MAX_MAPPED]]#0 : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/map-descriptor-deferral.f90 b/flang/test/Lower/OpenMP/map-descriptor-deferral.f90
new file mode 100644
index 0000000..daea2f3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/map-descriptor-deferral.f90
@@ -0,0 +1,96 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! This test checks that the descriptor deferral behaviour of the
+! MapInfoFinalization pass is preserved. Descriptor deferral is the
+! act of omitting the mapping of the descriptor itself when a
+! descriptor-carrying type is mapped. It applies only to assumed-shape
+! and assumed-size dummy arguments that are neither allocatable nor
+! pointers.
+
+subroutine assume_map_target_enter_exit(assumed_arr)
+ integer :: assumed_arr(:)
+ !$omp target enter data map(to: assumed_arr)
+ !$omp target
+ assumed_arr(1) = 10
+ !$omp end target
+ !$omp target exit data map(from: assumed_arr)
+end subroutine
+
+!CHECK-LABEL: func.func @_QPassume_map_target_enter_exit(
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[LOAD_BOX:.*]] = fir.load %[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[MAP_ADDR:.*]] = omp.map.info var_ptr(%[[LOAD_BOX]] : !fir.ref<!fir.array<?xi32>>, i32) map_clauses(to) capture(ByRef) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xi32>> {name = "assumed_arr"}
+!CHECK: omp.target_enter_data map_entries(%[[MAP_ADDR]] : !fir.ref<!fir.array<?xi32>>)
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[MAP_ADDR:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>, i32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[MAP_BOX:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(implicit, to) capture(ByRef) members(%{{.*}} : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "assumed_arr"}
+!CHECK: omp.target map_entries(%[[MAP_BOX]] -> %{{.*}}, %[[MAP_ADDR]] -> %{{.*}} : !fir.ref<!fir.array<?xi32>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[LOAD_BOX:.*]] = fir.load %[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[MAP_ADDR:.*]] = omp.map.info var_ptr(%[[LOAD_BOX]] : !fir.ref<!fir.array<?xi32>>, i32) map_clauses(from) capture(ByRef) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xi32>> {name = "assumed_arr"}
+!CHECK: omp.target_exit_data map_entries(%[[MAP_ADDR]] : !fir.ref<!fir.array<?xi32>>)
+
+subroutine assume_alloca_map_target_enter_exit(assumed_arr)
+ integer, allocatable :: assumed_arr(:)
+ !$omp target enter data map(to: assumed_arr)
+ !$omp target
+ assumed_arr(1) = 10
+ !$omp end target
+ !$omp target exit data map(from: assumed_arr)
+end subroutine
+
+!CHECK-LABEL: func.func @_QPassume_alloca_map_target_enter_exit(
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[BOX_ADDR_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) members(%[[BOX_ADDR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "assumed_arr"}
+!CHECK: omp.target_enter_data map_entries(%[[DESC_MAP]], %[[BOX_ADDR_MAP]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>)
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[BOX_ADDR_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(implicit, to) capture(ByRef) members(%[[BOX_ADDR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "assumed_arr"}
+!CHECK: omp.target map_entries(%[[DESC_MAP]] -> %[[VAL_28:.*]], %[[BOX_ADDR_MAP]] -> %[[VAL_29:.*]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[BOX_ADDR_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, i32) map_clauses(from) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.box<!fir.heap<!fir.array<?xi32>>>) map_clauses(from) capture(ByRef) members(%[[BOX_ADDR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {name = "assumed_arr"}
+!CHECK: omp.target_exit_data map_entries(%[[DESC_MAP]], %[[BOX_ADDR_MAP]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>)
+
+subroutine assume_pointer_map_target_enter_exit(assumed_arr)
+ integer, pointer :: assumed_arr(:)
+ !$omp target enter data map(to: assumed_arr)
+ !$omp target
+ assumed_arr(1) = 10
+ !$omp end target
+ !$omp target exit data map(from: assumed_arr)
+end subroutine
+
+!CHECK-LABEL: func.func @_QPassume_pointer_map_target_enter_exit(
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[BOX_ADDR_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.box<!fir.ptr<!fir.array<?xi32>>>) map_clauses(to) capture(ByRef) members(%[[BOX_ADDR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {name = "assumed_arr"}
+!CHECK: omp.target_enter_data map_entries(%[[DESC_MAP]], %[[BOX_ADDR_MAP]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>)
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[BOX_ADDR_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, i32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.box<!fir.ptr<!fir.array<?xi32>>>) map_clauses(implicit, to) capture(ByRef) members(%[[BOX_ADDR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {name = "assumed_arr"}
+!CHECK: omp.target map_entries(%[[DESC_MAP]] -> %[[VAL_28:.*]], %[[BOX_ADDR_MAP]] -> %[[VAL_29:.*]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[BOX_ADDR_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, i32) map_clauses(from) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.box<!fir.ptr<!fir.array<?xi32>>>) map_clauses(from) capture(ByRef) members(%[[BOX_ADDR_MAP]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {name = "assumed_arr"}
+!CHECK: omp.target_exit_data map_entries(%[[DESC_MAP]], %[[BOX_ADDR_MAP]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>)
+
+subroutine assume_map_target_data(assumed_arr)
+ integer :: assumed_arr(:)
+ !$omp target data map(to: assumed_arr)
+ !$omp target
+ assumed_arr(1) = 10
+ !$omp end target
+ !$omp end target data
+end subroutine
+
+!CHECK-LABEL: func.func @_QPassume_map_target_data(
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[MAP_ADDR:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>, i32) map_clauses(to) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[MAP_BOX:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(to) capture(ByRef) members(%[[MAP_ADDR]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "assumed_arr"}
+!CHECK: omp.target_data map_entries(%[[MAP_BOX]], %[[MAP_ADDR]] : !fir.ref<!fir.array<?xi32>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
+!CHECK: %[[BOX_ADDR:.*]] = fir.box_offset %{{.*}} base_addr : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>
+!CHECK: %[[MAP_ADDR:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>, i32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[BOX_ADDR]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>> {name = ""}
+!CHECK: %[[MAP_BOX:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<!fir.box<!fir.array<?xi32>>>, !fir.box<!fir.array<?xi32>>) map_clauses(implicit, to) capture(ByRef) members(%[[MAP_ADDR]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) -> !fir.ref<!fir.array<?xi32>> {name = "assumed_arr"}
+!CHECK: omp.target map_entries(%[[MAP_BOX]] -> %{{.*}}, %[[MAP_ADDR]] -> %{{.*}} : !fir.ref<!fir.array<?xi32>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>) {
diff --git a/flang/test/Lower/generic-shadows-specific.F90 b/flang/test/Lower/generic-shadows-specific.F90
new file mode 100644
index 0000000..e721908
--- /dev/null
+++ b/flang/test/Lower/generic-shadows-specific.F90
@@ -0,0 +1,40 @@
+
+#if STEP == 1
+! these modules must be read from module files
+module generic_shadows_specific_m1
+ interface f ! reference must be to generic
+ module procedure f ! must have same name as generic interface
+ end interface
+ contains
+ character function f() ! must be character
+ f = 'q'
+ end
+end
+module generic_shadows_specific_m2
+ use generic_shadows_specific_m1
+end
+module generic_shadows_specific_m3
+ use generic_shadows_specific_m2 ! must be generic_shadows_specific_m2, not generic_shadows_specific_m1
+ contains
+ subroutine mustExist() ! not called, but must exist
+ character x
+ x = f()
+ end
+end
+
+#else
+! Check that expected code produced with no crash.
+subroutine reproducer()
+ use generic_shadows_specific_m2
+ use generic_shadows_specific_m3
+ character x
+ x = f()
+end
+#endif
+
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang_fc1 -fsyntax-only -DSTEP=1 -J%t %s
+!RUN: %flang_fc1 -emit-fir -J%t -o - %s | FileCheck %s
+
+!CHECK-LABEL: func.func @_QPreproducer
+!CHECK: fir.call @_QMgeneric_shadows_specific_m1Pf
diff --git a/flang/test/Transforms/omp-map-info-finalization.fir b/flang/test/Transforms/omp-map-info-finalization.fir
index ed814cd..7bc0ae4 100644
--- a/flang/test/Transforms/omp-map-info-finalization.fir
+++ b/flang/test/Transforms/omp-map-info-finalization.fir
@@ -326,15 +326,15 @@ func.func @_QPreuse_alloca(%arg0: !fir.box<!fir.array<?xf64>> {fir.bindc_name =
// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[ALLOCA]]
// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[ALLOCA]]
// CHECK: omp.target_data map_entries
-// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[ALLOCA]]
-// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[ALLOCA]]
+// CHECK: %[[BOX_OFFSET:.*]] = fir.box_offset %[[ALLOCA]]
+// CHECK: %[[LOAD_OFFSET:.*]] = fir.load %[[BOX_OFFSET]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf64>>>
+// CHECK: %{{[0-9]+}} = omp.map.info var_ptr(%[[LOAD_OFFSET]]
// CHECK: omp.target_update map_entries
// CHECK: omp.terminator
// CHECK: }
// CHECK: return
-
omp.private {type = firstprivate} @boxchar.privatizer : !fir.boxchar<1> copy {
^bb0(%arg0: !fir.boxchar<1>, %arg1: !fir.boxchar<1>):
omp.yield(%arg0 : !fir.boxchar<1>)
diff --git a/libc/docs/build_and_test.rst b/libc/docs/build_and_test.rst
index dfdd965..d608591 100644
--- a/libc/docs/build_and_test.rst
+++ b/libc/docs/build_and_test.rst
@@ -47,12 +47,18 @@ and put the following in your settings.json file:
.. code-block:: javascript
{
- "cmake.sourceDirectory": "${workspaceFolder}/llvm",
+ "cmake.sourceDirectory": "${workspaceFolder}/runtimes",
"cmake.configureSettings": {
- "LLVM_ENABLE_PROJECTS" : "libc",
- "LLVM_LIBC_FULL_BUILD" : true,
- "LLVM_ENABLE_SPHINX" : true,
- "LIBC_INCLUDE_DOCS" : true
+ "LLVM_ENABLE_RUNTIMES" : ["libc", "compiler-rt"],
+ "LLVM_LIBC_FULL_BUILD" : true,
+ "LLVM_ENABLE_SPHINX" : true,
+ "LIBC_INCLUDE_DOCS" : true,
+ "LLVM_LIBC_INCLUDE_SCUDO" : true,
+ "COMPILER_RT_BUILD_SCUDO_STANDALONE_WITH_LLVM_LIBC": true,
+ "COMPILER_RT_BUILD_GWP_ASAN" : false,
+ "COMPILER_RT_SCUDO_STANDALONE_BUILD_SHARED" : false,
+ "CMAKE_EXPORT_COMPILE_COMMANDS" : true,
+ "LIBC_CMAKE_VERBOSE_LOGGING" : true
}
}
diff --git a/libc/src/__support/OSUtil/linux/auxv.h b/libc/src/__support/OSUtil/linux/auxv.h
index 894868a..9c07c54 100644
--- a/libc/src/__support/OSUtil/linux/auxv.h
+++ b/libc/src/__support/OSUtil/linux/auxv.h
@@ -16,6 +16,7 @@
#include <linux/auxvec.h> // For AT_ macros
#include <linux/mman.h> // For mmap flags
+#include <linux/param.h> // For EXEC_PAGESIZE
#include <linux/prctl.h> // For prctl
#include <sys/syscall.h> // For syscall numbers
@@ -90,7 +91,7 @@ LIBC_INLINE void Vector::fallback_initialize_unsync() {
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
// We do not proceed if mmap fails.
- if (mmap_ret <= 0)
+ if (!linux_utils::is_valid_mmap(mmap_ret))
return;
// Initialize the auxv array with AT_NULL entries.
diff --git a/libc/src/__support/OSUtil/linux/syscall.h b/libc/src/__support/OSUtil/linux/syscall.h
index 24e0fca..0e25618 100644
--- a/libc/src/__support/OSUtil/linux/syscall.h
+++ b/libc/src/__support/OSUtil/linux/syscall.h
@@ -12,6 +12,7 @@
#include "src/__support/CPP/bit.h"
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
#include "src/__support/macros/properties/architectures.h"
#ifdef LIBC_TARGET_ARCH_IS_X86_32
@@ -34,6 +35,19 @@ LIBC_INLINE R syscall_impl(long __number, Ts... ts) {
return cpp::bit_or_static_cast<R>(syscall_impl(__number, (long)ts...));
}
+// Linux-specific helpers for validating raw syscall return values.
+namespace linux_utils {
+LIBC_INLINE_VAR constexpr unsigned long MAX_ERRNO = 4095;
+// Ideally, this check would be defined in terms of PAGE_OFFSET.
+// However, that is a configurable kernel parameter, so we mimic the
+// kernel's own behavior by checking against MAX_ERRNO instead.
+template <typename PointerLike>
+LIBC_INLINE constexpr bool is_valid_mmap(PointerLike ptr) {
+ long addr = cpp::bit_cast<long>(ptr);
+ return LIBC_LIKELY(addr > 0 || addr < -static_cast<long>(MAX_ERRNO));
+}
+} // namespace linux_utils
+
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_SYSCALL_H
diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp
index d9e479ee..4f1e911 100644
--- a/libc/src/__support/threads/linux/thread.cpp
+++ b/libc/src/__support/threads/linux/thread.cpp
@@ -100,7 +100,7 @@ LIBC_INLINE ErrorOr<void *> alloc_stack(size_t stacksize, size_t guardsize) {
-1, // Not backed by any file
0 // No offset
);
- if (mmap_result < 0 && (uintptr_t(mmap_result) >= UINTPTR_MAX - size))
+ if (!linux_utils::is_valid_mmap(mmap_result))
return Error{int(-mmap_result)};
if (guardsize) {
diff --git a/libc/src/sys/mman/linux/mmap.cpp b/libc/src/sys/mman/linux/mmap.cpp
index 33f9fe8..76a6611 100644
--- a/libc/src/sys/mman/linux/mmap.cpp
+++ b/libc/src/sys/mman/linux/mmap.cpp
@@ -56,7 +56,7 @@ LLVM_LIBC_FUNCTION(void *, mmap,
// However, since a valid return address cannot be within the last page, a
// return value corresponding to a location in the last page is an error
// value.
- if (ret < 0 && ret > -EXEC_PAGESIZE) {
+ if (!linux_utils::is_valid_mmap(ret)) {
libc_errno = static_cast<int>(-ret);
return MAP_FAILED;
}
diff --git a/libc/startup/linux/aarch64/tls.cpp b/libc/startup/linux/aarch64/tls.cpp
index ea1b50c..7847797 100644
--- a/libc/startup/linux/aarch64/tls.cpp
+++ b/libc/startup/linux/aarch64/tls.cpp
@@ -62,7 +62,7 @@ void init_tls(TLSDescriptor &tls_descriptor) {
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
// We cannot check the return value with MAP_FAILED as that is the return
// of the mmap function and not the mmap syscall.
- if (mmap_ret_val < 0 && static_cast<uintptr_t>(mmap_ret_val) > -app.page_size)
+ if (!linux_utils::is_valid_mmap(mmap_ret_val))
syscall_impl<long>(SYS_exit, 1);
uintptr_t thread_ptr = uintptr_t(reinterpret_cast<uintptr_t *>(mmap_ret_val));
uintptr_t tls_addr = thread_ptr + size_of_pointers + padding;
diff --git a/libc/startup/linux/riscv/tls.cpp b/libc/startup/linux/riscv/tls.cpp
index 04d44e6..e9d23c5 100644
--- a/libc/startup/linux/riscv/tls.cpp
+++ b/libc/startup/linux/riscv/tls.cpp
@@ -50,7 +50,7 @@ void init_tls(TLSDescriptor &tls_descriptor) {
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
// We cannot check the return value with MAP_FAILED as that is the return
// of the mmap function and not the mmap syscall.
- if (mmap_ret_val < 0 && static_cast<uintptr_t>(mmap_ret_val) > -app.page_size)
+ if (!linux_utils::is_valid_mmap(mmap_ret_val))
syscall_impl<long>(SYS_exit, 1);
uintptr_t thread_ptr = uintptr_t(reinterpret_cast<uintptr_t *>(mmap_ret_val));
uintptr_t tls_addr = thread_ptr + size_of_pointers + padding;
diff --git a/libc/startup/linux/x86_64/tls.cpp b/libc/startup/linux/x86_64/tls.cpp
index d6b549a..497f744 100644
--- a/libc/startup/linux/x86_64/tls.cpp
+++ b/libc/startup/linux/x86_64/tls.cpp
@@ -53,7 +53,7 @@ void init_tls(TLSDescriptor &tls_descriptor) {
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
// We cannot check the return value with MAP_FAILED as that is the return
// of the mmap function and not the mmap syscall.
- if (mmap_retval < 0 && static_cast<uintptr_t>(mmap_retval) > -app.page_size)
+ if (!linux_utils::is_valid_mmap(mmap_retval))
syscall_impl<long>(SYS_exit, 1);
uintptr_t *tls_addr = reinterpret_cast<uintptr_t *>(mmap_retval);
diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst
index 6171629..dbe6948 100644
--- a/libcxx/docs/TestingLibcxx.rst
+++ b/libcxx/docs/TestingLibcxx.rst
@@ -419,10 +419,10 @@ writing tests easier. See `libc++-specific Lit Directives`_ for more information
- ``// FILE_DEPENDENCIES: file, directory, /path/to/file, ...``
- The paths given to the ``FILE_DEPENDENCIES`` directive can specify directories or specific files upon which a given test depend. For example, a test that requires some test
input stored in a data file would use this libc++-specific Lit directive. When a test file contains the ``FILE_DEPENDENCIES`` directive, Lit will collect the named files and copy
- them to the directory represented by the ``%T`` substitution before the test executes. The copy is performed from the directory represented by the ``%S`` substitution
+ them to the directory represented by the ``%{temp}`` substitution before the test executes. The copy is performed from the directory represented by the ``%S`` substitution
(i.e. the source directory of the test being executed) which makes it possible to use relative paths to specify the location of dependency files. After Lit copies
- all the dependent files to the directory specified by the ``%T`` substitution, that directory should contain *all* the necessary inputs to run. In other words,
- it should be possible to copy the contents of the directory specified by the ``%T`` substitution to a remote host where the execution of the test will actually occur.
+ all the dependent files to the directory specified by the ``%{temp}`` substitution, that directory should contain *all* the necessary inputs to run. In other words,
+ it should be possible to copy the contents of the directory specified by the ``%{temp}`` substitution to a remote host where the execution of the test will actually occur.
* - ``ADDITIONAL_COMPILE_FLAGS``
- ``// ADDITIONAL_COMPILE_FLAGS: flag1 flag2 ...``
- The additional compiler flags specified by a space-separated list to the ``ADDITIONAL_COMPILE_FLAGS`` libc++-specific Lit directive will be added to the end of the ``%{compile_flags}``
diff --git a/libcxx/include/string b/libcxx/include/string
index f5e05d8..8f80afbc 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -1312,6 +1312,8 @@ public:
# if _LIBCPP_STD_VER >= 23
template <class _Op>
_LIBCPP_HIDE_FROM_ABI constexpr void resize_and_overwrite(size_type __n, _Op __op) {
+ using __result_type = decltype(std::move(__op)(data(), auto(__n)));
+ static_assert(__integer_like<__result_type>, "Operation return type must be integer-like");
size_type __sz = size();
size_type __cap = capacity();
if (__n > __cap)
diff --git a/libcxx/test/benchmarks/spec.gen.py b/libcxx/test/benchmarks/spec.gen.py
index c36dd0a..f355b2c 100644
--- a/libcxx/test/benchmarks/spec.gen.py
+++ b/libcxx/test/benchmarks/spec.gen.py
@@ -8,13 +8,13 @@
# REQUIRES: enable-spec-benchmarks
-# RUN: mkdir -p %T
-# RUN: echo "%{cxx}" > %T/cxx.subs
-# RUN: echo "%{compile_flags}" > %T/compile_flags.subs
-# RUN: echo "%{flags}" > %T/flags.subs
-# RUN: echo "%{link_flags}" > %T/link_flags.subs
-# RUN: echo "%{spec_dir}" > %T/spec_dir.subs
-# RUN: %{python} %s %T
+# RUN: mkdir -p %{temp}
+# RUN: echo "%{cxx}" > %{temp}/cxx.subs
+# RUN: echo "%{compile_flags}" > %{temp}/compile_flags.subs
+# RUN: echo "%{flags}" > %{temp}/flags.subs
+# RUN: echo "%{link_flags}" > %{temp}/link_flags.subs
+# RUN: echo "%{spec_dir}" > %{temp}/spec_dir.subs
+# RUN: %{python} %s %{temp}
# END.
import json
@@ -66,18 +66,18 @@ spec_benchmarks &= no_fortran
for benchmark in spec_benchmarks:
print(f'#--- {benchmark}.sh.test')
- print(f'RUN: rm -rf %T') # clean up any previous (potentially incomplete) run
- print(f'RUN: mkdir %T')
- print(f'RUN: cp {spec_config} %T/spec-config.cfg')
- print(f'RUN: %{{spec_dir}}/bin/runcpu --config %T/spec-config.cfg --size train --output-root %T --rebuild {benchmark}')
- print(f'RUN: rm -rf %T/benchspec') # remove the temporary directory, which can become quite large
+ print(f'RUN: rm -rf %{{temp}}') # clean up any previous (potentially incomplete) run
+ print(f'RUN: mkdir %{{temp}}')
+ print(f'RUN: cp {spec_config} %{{temp}}/spec-config.cfg')
+ print(f'RUN: %{{spec_dir}}/bin/runcpu --config %{{temp}}/spec-config.cfg --size train --output-root %{{temp}} --rebuild {benchmark}')
+ print(f'RUN: rm -rf %{{temp}}/benchspec') # remove the temporary directory, which can become quite large
# The `runcpu` command above doesn't fail even if the benchmark fails to run. To determine failure, parse the CSV
# results and ensure there are no compilation errors or runtime errors in the status row. Also print the logs and
# fail if there are no CSV files at all, which implies a SPEC error.
- print(f'RUN: %{{libcxx-dir}}/utils/parse-spec-results --extract "Base Status" --keep-failed %T/result/*.train.csv > %T/status || ! cat %T/result/*.log')
- print(f'RUN: ! grep -E "CE|RE" %T/status || ! cat %T/result/*.log')
+ print(f'RUN: %{{libcxx-dir}}/utils/parse-spec-results --extract "Base Status" --keep-failed %{{temp}}/result/*.train.csv > %{{temp}}/status || ! cat %{{temp}}/result/*.log')
+ print(f'RUN: ! grep -E "CE|RE" %{{temp}}/status || ! cat %{{temp}}/result/*.log')
# If there were no errors, parse the results into LNT-compatible format and print them.
- print(f'RUN: %{{libcxx-dir}}/utils/parse-spec-results %T/result/*.train.csv --output-format=lnt > %T/results.lnt')
- print(f'RUN: cat %T/results.lnt')
+ print(f'RUN: %{{libcxx-dir}}/utils/parse-spec-results %{{temp}}/result/*.train.csv --output-format=lnt > %{{temp}}/results.lnt')
+ print(f'RUN: cat %{{temp}}/results.lnt')
diff --git a/libcxx/test/configs/apple-libc++-shared.cfg.in b/libcxx/test/configs/apple-libc++-shared.cfg.in
index 5504bfd..a361b2b 100644
--- a/libcxx/test/configs/apple-libc++-shared.cfg.in
+++ b/libcxx/test/configs/apple-libc++-shared.cfg.in
@@ -38,7 +38,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib-dir} -lc++ %{apple-system-shims}'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib-dir} -- '
+ '%{executor} --execdir %{temp} --env DYLD_LIBRARY_PATH=%{lib-dir} -- '
))
config.stdlib = 'apple-libc++'
diff --git a/libcxx/test/configs/apple-libc++-system.cfg.in b/libcxx/test/configs/apple-libc++-system.cfg.in
index b59506f..e87f920 100644
--- a/libcxx/test/configs/apple-libc++-system.cfg.in
+++ b/libcxx/test/configs/apple-libc++-system.cfg.in
@@ -19,7 +19,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib-dir} -lc++'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
config.stdlib = 'apple-libc++'
diff --git a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in
index b2669a7..f0782c2 100644
--- a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in
+++ b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in
@@ -30,7 +30,7 @@ config.executor = (
' --cpu cortex-m3')
config.substitutions.append(('%{exec}',
'%{executor}'
- ' --execdir %T'
+ ' --execdir %{temp}'
))
import os, site
diff --git a/libcxx/test/configs/ibm-libc++-shared.cfg.in b/libcxx/test/configs/ibm-libc++-shared.cfg.in
index 0f86e74..c24c729 100644
--- a/libcxx/test/configs/ibm-libc++-shared.cfg.in
+++ b/libcxx/test/configs/ibm-libc++-shared.cfg.in
@@ -18,7 +18,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib-dir} -lc++ -lc++abi -latomic -Wl,-bbigtoc'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --env LIBPATH=%{lib-dir} -- '
+ '%{executor} --execdir %{temp} --env LIBPATH=%{lib-dir} -- '
))
# LIBCXX-AIX-FIXME is the feature name used to XFAIL the
diff --git a/libcxx/test/configs/llvm-libc++-android.cfg.in b/libcxx/test/configs/llvm-libc++-android.cfg.in
index 9362c68..96c952d 100644
--- a/libcxx/test/configs/llvm-libc++-android.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-android.cfg.in
@@ -36,7 +36,7 @@ config.substitutions.append(('%{link_flags}',
config.substitutions.append(('%{exec}',
'%{executor}' +
' --job-limit-socket ' + libcxx.test.android.adb_job_limit_socket() +
- ' --prepend-path-env LD_LIBRARY_PATH /data/local/tmp/libc++ --execdir %T -- '
+ ' --prepend-path-env LD_LIBRARY_PATH /data/local/tmp/libc++ --execdir %{temp} -- '
))
libcxx.test.config.configure(
diff --git a/libcxx/test/configs/llvm-libc++-mingw.cfg.in b/libcxx/test/configs/llvm-libc++-mingw.cfg.in
index 01c4d58..4473171 100644
--- a/libcxx/test/configs/llvm-libc++-mingw.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-mingw.cfg.in
@@ -11,7 +11,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib-dir} -lc++'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{install-prefix}/bin -- '
+ '%{executor} --execdir %{temp} --prepend_env PATH=%{install-prefix}/bin -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in
index 5fa99bc..e6186a7 100644
--- a/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared-clangcl.cfg.in
@@ -25,7 +25,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib -L %{lib-dir} -lc++ -l' + cxx_lib
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{install-prefix}/bin -- '
+ '%{executor} --execdir %{temp} --prepend_env PATH=%{install-prefix}/bin -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in b/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in
index 649bd31..1b4ddc2 100644
--- a/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared-gcc.cfg.in
@@ -12,7 +12,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib-dir} -Wl,-rpath,%{lib-dir} -lc++ -lm'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in
index e95999d..476c5ca 100644
--- a/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared-no-vcruntime-clangcl.cfg.in
@@ -26,7 +26,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib -L %{lib-dir} -lc++ -l' + cxx_lib
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{install-prefix}/bin -- '
+ '%{executor} --execdir %{temp} --prepend_env PATH=%{install-prefix}/bin -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-shared.cfg.in b/libcxx/test/configs/llvm-libc++-shared.cfg.in
index 0c059f0..6d4a383 100644
--- a/libcxx/test/configs/llvm-libc++-shared.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-shared.cfg.in
@@ -13,7 +13,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib-dir} -Wl,-rpath,%{lib-dir} -lc++'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in b/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in
index 4b6b3fc..2f2b420 100644
--- a/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-static-clangcl.cfg.in
@@ -25,7 +25,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib -L %{lib-dir} -llibc++ -l' + cxx_lib
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libcxx/test/configs/llvm-libc++-static.cfg.in b/libcxx/test/configs/llvm-libc++-static.cfg.in
index 097cc4d..45a44c1 100644
--- a/libcxx/test/configs/llvm-libc++-static.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-static.cfg.in
@@ -13,7 +13,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib-dir} -lc++ -lc++abi'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libcxx/test/configs/stdlib-libstdc++.cfg.in b/libcxx/test/configs/stdlib-libstdc++.cfg.in
index 3ff0c54..1a4b47c 100644
--- a/libcxx/test/configs/stdlib-libstdc++.cfg.in
+++ b/libcxx/test/configs/stdlib-libstdc++.cfg.in
@@ -51,7 +51,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{libstdcxx-install-prefix}/lib/gcc/%{libstdcxx-version} -Wl,-rpath,%{libstdcxx-install-prefix}/lib/gcc/%{libstdcxx-version} -lstdc++'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libcxx/test/configs/stdlib-native.cfg.in b/libcxx/test/configs/stdlib-native.cfg.in
index 3e25c1e..b827155f 100644
--- a/libcxx/test/configs/stdlib-native.cfg.in
+++ b/libcxx/test/configs/stdlib-native.cfg.in
@@ -11,7 +11,7 @@ config.substitutions.append(('%{flags}',
))
config.substitutions.append(('%{compile_flags}', '-I %{libcxx-dir}/test/support'))
config.substitutions.append(('%{link_flags}', ''))
-config.substitutions.append(('%{exec}', '%{executor} --execdir %T -- '))
+config.substitutions.append(('%{exec}', '%{executor} --execdir %{temp} -- '))
import os, site
site.addsitedir(os.path.join('@LIBCXX_SOURCE_DIR@', 'utils'))
diff --git a/libcxx/test/selftest/dsl/dsl.sh.py b/libcxx/test/selftest/dsl/dsl.sh.py
index 6d4406b..93f351f 100644
--- a/libcxx/test/selftest/dsl/dsl.sh.py
+++ b/libcxx/test/selftest/dsl/dsl.sh.py
@@ -12,7 +12,7 @@
# Note: We prepend arguments with 'x' to avoid thinking there are too few
# arguments in case an argument is an empty string.
-# RUN: %{python} %s x%S x%T x%{substitutions}
+# RUN: %{python} %s x%S x%{temp} x%{substitutions}
import base64
import copy
diff --git a/libcxx/test/selftest/dsl/lit.local.cfg b/libcxx/test/selftest/dsl/lit.local.cfg
index 352719b..dc6887a 100644
--- a/libcxx/test/selftest/dsl/lit.local.cfg
+++ b/libcxx/test/selftest/dsl/lit.local.cfg
@@ -1,8 +1,8 @@
# Since we try to pass substitutions as-is to some tests, we must "escape"
# them in case they contain other substitutions. Otherwise, the substitutions
# will be fully expanded when passed to the tests. For example, we want an
-# %{exec} substitution that contains `--execdir %T` to be passed as-is, without
-# substituting the directory. This way, the test itself can populate %T as it
+# %{exec} substitution that contains `--execdir %{temp}` to be passed as-is, without
+# substituting the directory. This way, the test itself can populate %{temp} as it
# sees fit, and %{exec} will respect it.
#
# To solve this problem, we pickle the substitutions and base64 encode that
diff --git a/libcxx/test/selftest/file_dependencies/absolute-and-relative-paths.sh.cpp b/libcxx/test/selftest/file_dependencies/absolute-and-relative-paths.sh.cpp
index ac52f67..68283f6 100644
--- a/libcxx/test/selftest/file_dependencies/absolute-and-relative-paths.sh.cpp
+++ b/libcxx/test/selftest/file_dependencies/absolute-and-relative-paths.sh.cpp
@@ -9,7 +9,7 @@
// Make sure that FILE_DEPENDENCIES work with relative AND absolute paths.
// FILE_DEPENDENCIES: %S/a.txt
-// RUN: test -e %T/a.txt
+// RUN: test -e %{temp}/a.txt
// FILE_DEPENDENCIES: dir/b.txt
-// RUN: test -e %T/b.txt
+// RUN: test -e %{temp}/b.txt
diff --git a/libcxx/test/selftest/file_dependencies/substitute-in-dependencies.sh.cpp b/libcxx/test/selftest/file_dependencies/substitute-in-dependencies.sh.cpp
index c63684c..59de718 100644
--- a/libcxx/test/selftest/file_dependencies/substitute-in-dependencies.sh.cpp
+++ b/libcxx/test/selftest/file_dependencies/substitute-in-dependencies.sh.cpp
@@ -9,4 +9,4 @@
// Make sure that lit substitutions are expanded inside FILE_DEPENDENCIES lines.
// FILE_DEPENDENCIES: %s
-// RUN: test -e %T/substitute-in-dependencies.sh.cpp
+// RUN: test -e %{temp}/substitute-in-dependencies.sh.cpp
diff --git a/libcxx/test/selftest/tmpdir-exists.sh.cpp b/libcxx/test/selftest/tmpdir-exists.sh.cpp
index 7f9e69d..4edd165 100644
--- a/libcxx/test/selftest/tmpdir-exists.sh.cpp
+++ b/libcxx/test/selftest/tmpdir-exists.sh.cpp
@@ -6,6 +6,6 @@
//
//===----------------------------------------------------------------------===//
-// Make sure that the directory represented by %T exists when we run the test.
+// Make sure that the directory represented by %{temp} exists when we run the test.
-// RUN: test -d %T
+// RUN: test -d %{temp}
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
index abd2848..e5c1963 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.pass.cpp
@@ -21,6 +21,7 @@
#include "make_string.h"
#include "test_macros.h"
#include "asan_testing.h"
+#include "type_algorithms.h"
template <class S>
constexpr void test_appending(std::size_t k, size_t N, size_t new_capacity) {
@@ -77,17 +78,30 @@ constexpr bool test() {
return true;
}
-void test_value_categories() {
+constexpr bool test_value_categories() {
std::string s;
s.resize_and_overwrite(10, [](char*&&, std::size_t&&) { return 0; });
LIBCPP_ASSERT(is_string_asan_correct(s));
s.resize_and_overwrite(10, [](char* const&, const std::size_t&) { return 0; });
LIBCPP_ASSERT(is_string_asan_correct(s));
struct RefQualified {
- int operator()(char*, std::size_t) && { return 0; }
+ constexpr int operator()(char*, std::size_t) && { return 0; }
};
s.resize_and_overwrite(10, RefQualified{});
LIBCPP_ASSERT(is_string_asan_correct(s));
+ return true;
+}
+
+constexpr bool test_integer_like_return_types() {
+ types::for_each(types::integer_types(), []<typename IntegerType> {
+ std::string s;
+ s.resize_and_overwrite(10, [](char* p, std::size_t n) -> IntegerType {
+ std::fill(p, p + n, 'f');
+ return n;
+ });
+ assert(s.size() == 10);
+ });
+ return true;
}
int main(int, char**) {
@@ -105,5 +119,12 @@ int main(int, char**) {
test<std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t>>>();
static_assert(test<std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t>>>());
#endif
+
+ test_value_categories();
+ test_integer_like_return_types();
+
+ static_assert(test_value_categories());
+ static_assert(test_integer_like_return_types());
+
return 0;
}
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.verify.cpp b/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.verify.cpp
new file mode 100644
index 0000000..323f49b
--- /dev/null
+++ b/libcxx/test/std/strings/basic.string/string.capacity/resize_and_overwrite.verify.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: std-at-least-c++23
+
+// <string>
+
+// template<class Operation>
+// void resize_and_overwrite(size_type n, Operation op)
+
+// Verify that the operation's return type must be integer-like
+
+#include <string>
+
+void test_bool_return_type() {
+ std::string s;
+ s.resize_and_overwrite(10, [](char*, std::size_t) {
+ return true; // expected-error-re@*:* {{{{(static_assertion|static assertion)}}{{.*}}integer-like}}
+ });
+}
+
+void test_pointer_return_type() {
+ std::string s;
+ s.resize_and_overwrite(10, [](char* p, std::size_t) {
+ return p; // expected-error-re@*:* {{{{(static_assertion|static assertion)}}{{.*}}integer-like}}
+ // expected-error@*:* {{cannot initialize}}
+ });
+}
+
+void test_float_return_type() {
+ std::string s;
+ s.resize_and_overwrite(10, [](char*, std::size_t) {
+ return 5.0f; // expected-error-re@*:* {{{{(static_assertion|static assertion)}}{{.*}}integer-like}}
+ });
+}
diff --git a/libcxx/utils/libcxx/test/dsl.py b/libcxx/utils/libcxx/test/dsl.py
index 2d3a72c..3fb30d8 100644
--- a/libcxx/utils/libcxx/test/dsl.py
+++ b/libcxx/utils/libcxx/test/dsl.py
@@ -111,8 +111,8 @@ def _makeConfigTest(config):
os.makedirs(supportDir)
# Create a dummy test suite and single dummy test inside it. As part of
- # the Lit configuration, automatically do the equivalent of 'mkdir %T'
- # and 'rm -r %T' to avoid cluttering the build directory.
+ # the Lit configuration, automatically do the equivalent of 'mkdir %{temp}'
+ # and 'rm -r %{temp}' to avoid cluttering the build directory.
suite = lit.Test.TestSuite("__config__", sourceRoot, execRoot, config)
tmp = tempfile.NamedTemporaryFile(dir=sourceRoot, delete=False, suffix=".cpp")
tmp.close()
diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py
index c9dffd1..3fcd250 100644
--- a/libcxx/utils/libcxx/test/format.py
+++ b/libcxx/utils/libcxx/test/format.py
@@ -17,10 +17,10 @@ LIBCXX_UTILS = os.path.dirname(os.path.dirname(os.path.dirname(THIS_FILE)))
def _getTempPaths(test):
"""
- Return the values to use for the %T and %t substitutions, respectively.
+ Return the values to use for the %{temp} and %t substitutions, respectively.
The difference between this and Lit's default behavior is that we guarantee
- that %T is a path unique to the test being run.
+ that %{temp} is a path unique to the test being run.
"""
tmpDir, _ = lit.TestRunner.getTempPaths(test)
_, testName = os.path.split(test.getExecPath())
@@ -92,7 +92,7 @@ def parseScript(test, preamble):
# errors, which doesn't make sense for clang-verify tests because we may want to check
# for specific warning diagnostics.
_checkBaseSubstitutions(substitutions)
- substitutions.append(("%T", tmpDir))
+ substitutions.append(("%{temp}", tmpDir))
substitutions.append(
("%{build}", "%{cxx} %s %{flags} %{compile_flags} %{link_flags} -o %t.exe")
)
@@ -150,7 +150,7 @@ def parseScript(test, preamble):
# that file to the execution directory. Execute the copy from %S to allow
# relative paths from the test directory.
for dep in fileDependencies:
- script += ["%dbg(SETUP) cd %S && cp {} %T".format(dep)]
+ script += ["%dbg(SETUP) cd %S && cp {} %{{temp}}".format(dep)]
script += preamble
script += scriptInTest
@@ -178,11 +178,11 @@ def parseScript(test, preamble):
"%dbg(MODULE std.compat) %{cxx} %{flags} "
f"{compileFlags} "
"-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal "
- "-fmodule-file=std=%T/std.pcm " # The std.compat module imports std.
- "--precompile -o %T/std.compat.pcm -c %{module-dir}/std.compat.cppm",
+ "-fmodule-file=std=%{temp}/std.pcm " # The std.compat module imports std.
+ "--precompile -o %{temp}/std.compat.pcm -c %{module-dir}/std.compat.cppm",
)
moduleCompileFlags.extend(
- ["-fmodule-file=std.compat=%T/std.compat.pcm", "%T/std.compat.pcm"]
+ ["-fmodule-file=std.compat=%{temp}/std.compat.pcm", "%{temp}/std.compat.pcm"]
)
# Make sure the std module is built before std.compat. Libc++'s
@@ -195,9 +195,9 @@ def parseScript(test, preamble):
"%dbg(MODULE std) %{cxx} %{flags} "
f"{compileFlags} "
"-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal "
- "--precompile -o %T/std.pcm -c %{module-dir}/std.cppm",
+ "--precompile -o %{temp}/std.pcm -c %{module-dir}/std.cppm",
)
- moduleCompileFlags.extend(["-fmodule-file=std=%T/std.pcm", "%T/std.pcm"])
+ moduleCompileFlags.extend(["-fmodule-file=std=%{temp}/std.pcm", "%{temp}/std.pcm"])
# Add compile flags required for the modules.
substitutions = config._appendToSubstitution(
@@ -261,6 +261,10 @@ class CxxStandardLibraryTest(lit.formats.FileBasedTest):
%{run}
Equivalent to `%{exec} %t.exe`. This is intended to be used
in conjunction with the %{build} substitution.
+
+ %{temp}
+ This substitution expands to a non-existent temporary path unique to the test.
+ It is typically used to create a temporary directory.
"""
def getTestsForPath(self, testSuite, pathInSuite, litConfig, localConfig):
@@ -355,9 +359,9 @@ class CxxStandardLibraryTest(lit.formats.FileBasedTest):
"%dbg(COMPILED WITH) %{cxx} %s %{flags} %{compile_flags} %{benchmark_flags} %{link_flags} -o %t.exe",
]
if "enable-benchmarks=run" in test.config.available_features:
- steps += ["%dbg(EXECUTED AS) %{exec} %t.exe --benchmark_out=%T/benchmark-result.json --benchmark_out_format=json"]
+ steps += ["%dbg(EXECUTED AS) %{exec} %t.exe --benchmark_out=%{temp}/benchmark-result.json --benchmark_out_format=json"]
parse_results = os.path.join(LIBCXX_UTILS, 'parse-google-benchmark-results')
- steps += [f"{parse_results} %T/benchmark-result.json --output-format=lnt > %T/results.lnt"]
+ steps += [f"{parse_results} %{temp}/benchmark-result.json --output-format=lnt > %{temp}/results.lnt"]
return self._executeShTest(test, litConfig, steps)
elif re.search('[.]gen[.][^.]+$', filename): # This only happens when a generator test is not supported
return self._executeShTest(test, litConfig, [])
diff --git a/libcxx/utils/ssh.py b/libcxx/utils/ssh.py
index ec16efc..77e79ce 100755
--- a/libcxx/utils/ssh.py
+++ b/libcxx/utils/ssh.py
@@ -57,7 +57,7 @@ def main():
return subprocess.run(command, *args_, **kwargs)
# Create a temporary directory where the test will be run.
- # That is effectively the value of %T on the remote host.
+ # That is effectively the value of %{temp} on the remote host.
tmp = runCommand(
ssh("mktemp -d {}/libcxx.XXXXXXXXXX".format(args.tempdir)),
universal_newlines=True,
diff --git a/libcxx/utils/test-at-commit b/libcxx/utils/test-at-commit
index d62643d..f20bf5f 100755
--- a/libcxx/utils/test-at-commit
+++ b/libcxx/utils/test-at-commit
@@ -22,7 +22,7 @@ config.substitutions.append(('%{{flags}}',
))
config.substitutions.append(('%{{compile_flags}}', '-nostdinc++ -I {INSTALL_ROOT}/include/c++/v1 -I %{{libcxx-dir}}/test/support'))
config.substitutions.append(('%{{link_flags}}', '-nostdlib++ -L {INSTALL_ROOT}/lib -Wl,-rpath,{INSTALL_ROOT}/lib -lc++'))
-config.substitutions.append(('%{{exec}}', '%{{executor}} --execdir %T -- '))
+config.substitutions.append(('%{{exec}}', '%{{executor}} --execdir %{{temp}} -- '))
import os, site
site.addsitedir(os.path.join('@LIBCXX_SOURCE_DIR@', 'utils'))
diff --git a/libcxxabi/test/configs/apple-libc++abi-shared.cfg.in b/libcxxabi/test/configs/apple-libc++abi-shared.cfg.in
index 537fe82..0da074b 100644
--- a/libcxxabi/test/configs/apple-libc++abi-shared.cfg.in
+++ b/libcxxabi/test/configs/apple-libc++abi-shared.cfg.in
@@ -36,7 +36,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -lc++ %{apple-system-shims}'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --env DYLD_LIBRARY_PATH=%{lib} -- '
+ '%{executor} --execdir %{temp} --env DYLD_LIBRARY_PATH=%{lib} -- '
))
config.stdlib = 'apple-libc++'
diff --git a/libcxxabi/test/configs/apple-libc++abi-system.cfg.in b/libcxxabi/test/configs/apple-libc++abi-system.cfg.in
index 1e80eee..0e62e03 100644
--- a/libcxxabi/test/configs/apple-libc++abi-system.cfg.in
+++ b/libcxxabi/test/configs/apple-libc++abi-system.cfg.in
@@ -20,7 +20,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -lc++'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
config.stdlib = 'apple-libc++'
diff --git a/libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in b/libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in
index 0594ba4..e61efbb 100644
--- a/libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in
+++ b/libcxxabi/test/configs/armv7m-picolibc-libc++abi.cfg.in
@@ -25,7 +25,7 @@ config.executor = (
' --cpu cortex-m3')
config.substitutions.append(('%{exec}',
'%{executor}'
- ' --execdir %T'
+ ' --execdir %{temp}'
))
import os, site
diff --git a/libcxxabi/test/configs/ibm-libc++abi-shared.cfg.in b/libcxxabi/test/configs/ibm-libc++abi-shared.cfg.in
index bd6c1fb..238cfdb 100644
--- a/libcxxabi/test/configs/ibm-libc++abi-shared.cfg.in
+++ b/libcxxabi/test/configs/ibm-libc++abi-shared.cfg.in
@@ -19,7 +19,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -lc++ -lc++abi -Wl,-bbigtoc'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --env LIBPATH=%{lib} -- '
+ '%{executor} --execdir %{temp} --env LIBPATH=%{lib} -- '
))
import os, site
diff --git a/libcxxabi/test/configs/llvm-libc++abi-android.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
index bc58446..e8bc3c9 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
@@ -29,7 +29,7 @@ config.substitutions.append(('%{link_flags}',
config.substitutions.append(('%{exec}',
'%{executor}' +
' --job-limit-socket ' + libcxx.test.android.adb_job_limit_socket() +
- ' --prepend-path-env LD_LIBRARY_PATH /data/local/tmp/libc++ --execdir %T -- '
+ ' --prepend-path-env LD_LIBRARY_PATH /data/local/tmp/libc++ --execdir %{temp} -- '
))
libcxx.test.config.configure(
diff --git a/libcxxabi/test/configs/llvm-libc++abi-merged.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-merged.cfg.in
index c6fa430..0aecc44 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-merged.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-merged.cfg.in
@@ -13,7 +13,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -Wl,-rpath,%{lib} -lc++ -pthread'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libcxxabi/test/configs/llvm-libc++abi-mingw.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-mingw.cfg.in
index 0d8b7bd..14f7ab8 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-mingw.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-mingw.cfg.in
@@ -11,7 +11,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -lc++'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{install-prefix}/bin -- '
+ '%{executor} --execdir %{temp} --prepend_env PATH=%{install-prefix}/bin -- '
))
import os, site
diff --git a/libcxxabi/test/configs/llvm-libc++abi-shared-clangcl.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-shared-clangcl.cfg.in
index cfdfc0f..01cd2b2 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-shared-clangcl.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-shared-clangcl.cfg.in
@@ -11,7 +11,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib -L %{lib} -lc++ -lc++abi -lmsvcrt -lmsvcprt -loldnames'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{lib} -- '
+ '%{executor} --execdir %{temp} --prepend_env PATH=%{lib} -- '
))
import os, site
diff --git a/libcxxabi/test/configs/llvm-libc++abi-shared.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-shared.cfg.in
index 6b69205..e5f18d5 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-shared.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-shared.cfg.in
@@ -13,7 +13,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -Wl,-rpath,%{lib} -lc++ -lc++abi -pthread'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libcxxabi/test/configs/llvm-libc++abi-static-clangcl.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-static-clangcl.cfg.in
index ba67c8b..9f48389 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-static-clangcl.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-static-clangcl.cfg.in
@@ -11,7 +11,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib -L %{lib} -llibc++ -llibc++abi -lmsvcrt -lmsvcprt -loldnames'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{lib} -- '
+ '%{executor} --execdir %{temp} --prepend_env PATH=%{lib} -- '
))
import os, site
diff --git a/libcxxabi/test/configs/llvm-libc++abi-static.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-static.cfg.in
index 352e2c3..3fca7b3 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-static.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-static.cfg.in
@@ -13,7 +13,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -lc++ -lc++abi -pthread'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libunwind/test/configs/apple-libunwind-system.cfg.in b/libunwind/test/configs/apple-libunwind-system.cfg.in
index e5a7c98..252448a 100644
--- a/libunwind/test/configs/apple-libunwind-system.cfg.in
+++ b/libunwind/test/configs/apple-libunwind-system.cfg.in
@@ -19,7 +19,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -lc++ -lunwind'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
config.stdlib = 'apple-libc++'
diff --git a/libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in b/libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in
index fc54900..6ffdd70 100644
--- a/libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in
+++ b/libunwind/test/configs/armv7m-picolibc-libunwind.cfg.in
@@ -25,7 +25,7 @@ config.executor = (
' --cpu cortex-m3')
config.substitutions.append(('%{exec}',
'%{executor}'
- ' --execdir %T'
+ ' --execdir %{temp}'
))
import os, site
diff --git a/libunwind/test/configs/ibm-libunwind-shared.cfg.in b/libunwind/test/configs/ibm-libunwind-shared.cfg.in
index 2221e0c..99f4a90 100644
--- a/libunwind/test/configs/ibm-libunwind-shared.cfg.in
+++ b/libunwind/test/configs/ibm-libunwind-shared.cfg.in
@@ -17,7 +17,7 @@ config.substitutions.append(('%{link_flags}',
'-nostdlib++ -L %{lib} -lunwind -ldl -Wl,-bbigtoc'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --env LIBPATH=%{lib} -- '
+ '%{executor} --execdir %{temp} --env LIBPATH=%{lib} -- '
))
import os, site
diff --git a/libunwind/test/configs/llvm-libunwind-merged.cfg.in b/libunwind/test/configs/llvm-libunwind-merged.cfg.in
index fafe129..34950f6 100644
--- a/libunwind/test/configs/llvm-libunwind-merged.cfg.in
+++ b/libunwind/test/configs/llvm-libunwind-merged.cfg.in
@@ -35,7 +35,7 @@ config.substitutions.append(('%{link_flags}',
'-L %{{lib}} -Wl,-rpath,%{{lib}} -lc++ {}'.format(' '.join(link_flags))
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libunwind/test/configs/llvm-libunwind-shared-mingw.cfg.in b/libunwind/test/configs/llvm-libunwind-shared-mingw.cfg.in
index b29d83f..1e77638 100644
--- a/libunwind/test/configs/llvm-libunwind-shared-mingw.cfg.in
+++ b/libunwind/test/configs/llvm-libunwind-shared-mingw.cfg.in
@@ -11,7 +11,7 @@ config.substitutions.append(('%{link_flags}',
'-L %{lib} -lunwind'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{install-prefix}/bin -- '
+ '%{executor} --execdir %{temp} --prepend_env PATH=%{install-prefix}/bin -- '
))
import os, site
diff --git a/libunwind/test/configs/llvm-libunwind-shared.cfg.in b/libunwind/test/configs/llvm-libunwind-shared.cfg.in
index f3e4092..61d6b61 100644
--- a/libunwind/test/configs/llvm-libunwind-shared.cfg.in
+++ b/libunwind/test/configs/llvm-libunwind-shared.cfg.in
@@ -34,7 +34,7 @@ config.substitutions.append(('%{link_flags}',
'-L %{{lib}} -Wl,-rpath,%{{lib}} -lunwind {}'.format(' '.join(link_flags))
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/libunwind/test/configs/llvm-libunwind-static-mingw.cfg.in b/libunwind/test/configs/llvm-libunwind-static-mingw.cfg.in
index 437c53b..37d20a7 100644
--- a/libunwind/test/configs/llvm-libunwind-static-mingw.cfg.in
+++ b/libunwind/test/configs/llvm-libunwind-static-mingw.cfg.in
@@ -11,7 +11,7 @@ config.substitutions.append(('%{link_flags}',
'-L %{lib} -lunwind'
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T --prepend_env PATH=%{lib} -- '
+ '%{executor} --execdir %{temp} --prepend_env PATH=%{lib} -- '
))
import os, site
diff --git a/libunwind/test/configs/llvm-libunwind-static.cfg.in b/libunwind/test/configs/llvm-libunwind-static.cfg.in
index a3a65ae..194fa4f 100644
--- a/libunwind/test/configs/llvm-libunwind-static.cfg.in
+++ b/libunwind/test/configs/llvm-libunwind-static.cfg.in
@@ -37,7 +37,7 @@ config.substitutions.append(('%{link_flags}',
'%{{lib}}/libunwind.a {}'.format(' '.join(link_flags))
))
config.substitutions.append(('%{exec}',
- '%{executor} --execdir %T -- '
+ '%{executor} --execdir %{temp} -- '
))
import os, site
diff --git a/lld/utils/speed-test-reproducers/collect.nix b/lld/utils/speed-test-reproducers/collect.nix
new file mode 100644
index 0000000..ab5f5ee
--- /dev/null
+++ b/lld/utils/speed-test-reproducers/collect.nix
@@ -0,0 +1,152 @@
+#===-----------------------------------------------------------------------===//
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===----------------------------------------------------------------------===//
+#
+# This is a Nix recipe for collecting reproducers for benchmarking purposes in a
+# reproducible way. It works by injecting a linker wrapper that embeds a
+# reproducer tarball into a non-allocated section of every linked object, which
+# generally causes them to be smuggled out of the build tree in a section of the
+# final binaries. In principle, this technique should let us collect reproducers
+# from any project packaged by Nix without project-specific knowledge, but as
+# you can see below, many interesting ones need a few hacks.
+#
+# If you have Nix installed, you can build the reproducers with the following
+# command:
+#
+# TMPDIR=/var/tmp nix-build -j6 --log-format bar collect.nix
+#
+# This will result in building several large projects including Chromium and
+# Firefox, which will take some time, and it will also build most of the
+# dependencies for non-native targets. Eventually you will get a result
+# directory containing all the reproducers.
+#
+# The following projects have been tested successfully:
+# - chrome (native only, cross builds fail building the qtbase dependency)
+# - firefox (all targets)
+# - linux-kernel (all targets, requires patched nixpkgs)
+# - ladybird (native only, same problem as chromium)
+# - llvm (all targets)
+
+{
+ nixpkgs ? fetchTarball "https://github.com/NixOS/nixpkgs/archive/992f916556fcfaa94451ebc7fc6e396134bbf5b1.tar.gz",
+ system ? builtins.currentSystem,
+}:
+let
+ reproducerPkgs =
+ crossSystem:
+ let
+ pkgsCross = import nixpkgs { inherit crossSystem; };
+ # Wraps the given stdenv and lld package into a variant that collects
+ # the reproducer and builds with debug info.
+ reproducerCollectingStdenv =
+ stdenv: lld:
+ let
+ bintools = stdenv.cc.bintools.override {
+ extraBuildCommands = ''
+ wrap ${stdenv.cc.targetPrefix}nix-wrap-lld ${pkgsCross.path}/pkgs/build-support/bintools-wrapper/ld-wrapper.sh ${lld}/bin/ld.lld
+ export lz4=${pkgsCross.lib.getBin pkgsCross.buildPackages.lz4}/bin/lz4
+ substituteAll ${./ld-wrapper.sh} $out/bin/${stdenv.cc.targetPrefix}ld
+ chmod +x $out/bin/${stdenv.cc.targetPrefix}ld
+ substituteAll ${./ld-wrapper.sh} $out/bin/${stdenv.cc.targetPrefix}ld.lld
+ chmod +x $out/bin/${stdenv.cc.targetPrefix}ld.lld
+ '';
+ };
+ in
+ pkgsCross.withCFlags [ "-g1" ] (
+ stdenv.override (old: {
+ allowedRequisites = null;
+ cc = stdenv.cc.override { inherit bintools; };
+ })
+ );
+ withReproducerCollectingStdenv =
+ pkg:
+ pkg.override {
+ stdenv = reproducerCollectingStdenv pkgsCross.stdenv pkgsCross.buildPackages.lld;
+ };
+ withReproducerCollectingClangStdenv =
+ pkg:
+ pkg.override {
+ clangStdenv = reproducerCollectingStdenv pkgsCross.clangStdenv pkgsCross.buildPackages.lld;
+ };
+ in
+ {
+ # For benchmarking the linker we want to disable LTO as otherwise we would
+ # just be benchmarking the LLVM optimizer. Also, we generally want the
+ # package to use the regular stdenv in order to simplify wrapping it.
+ # Firefox normally uses the rustc stdenv but uses the regular one if
+ # LTO is disabled so we kill two birds with one stone by disabling it.
+ # Chromium uses the rustc stdenv unconditionally so we need the stuff
+ # below to make sure that it finds our wrapped stdenv.
+ chrome =
+ (pkgsCross.chromium.override {
+ newScope =
+ extra:
+ pkgsCross.newScope (
+ extra
+ // {
+ pkgsBuildBuild = {
+ pkg-config = pkgsCross.pkgsBuildBuild.pkg-config;
+ rustc = {
+ llvmPackages = rec {
+ stdenv = reproducerCollectingStdenv pkgsCross.pkgsBuildBuild.rustc.llvmPackages.stdenv pkgsCross.pkgsBuildBuild.rustc.llvmPackages.lld;
+ bintools = stdenv.cc.bintools;
+ };
+ };
+ };
+ }
+ );
+ pkgs = {
+ rustc = {
+ llvmPackages = {
+ stdenv = reproducerCollectingStdenv pkgsCross.rustc.llvmPackages.stdenv pkgsCross.rustc.llvmPackages.lld;
+ };
+ };
+ };
+ }).browser.overrideAttrs
+ (old: {
+ configurePhase = old.configurePhase + ''
+ echo use_thin_lto = false >> out/Release/args.gn
+ echo is_cfi = false >> out/Release/args.gn
+ '';
+ });
+ firefox = (withReproducerCollectingStdenv pkgsCross.firefox-unwrapped).override {
+ ltoSupport = false;
+ pgoSupport = false;
+ };
+ # Doesn't work as-is because the kernel derivation calls the linker
+ # directly instead of the wrapper. See:
+ # https://github.com/NixOS/nixpkgs/blob/d3fdff1631946f3e51318317375d638dae3d6aa2/pkgs/os-specific/linux/kernel/common-flags.nix#L12
+ linux-kernel = (withReproducerCollectingStdenv pkgsCross.linux_latest).dev;
+ ladybird = withReproducerCollectingStdenv pkgsCross.ladybird;
+ llvm = withReproducerCollectingStdenv pkgsCross.llvm;
+ webkitgtk = withReproducerCollectingClangStdenv pkgsCross.webkitgtk;
+ hello = withReproducerCollectingStdenv pkgsCross.hello;
+ };
+ targets = {
+ x86_64 = reproducerPkgs { config = "x86_64-unknown-linux-gnu"; };
+ aarch64 = reproducerPkgs { config = "aarch64-unknown-linux-gnu"; };
+ riscv64 = reproducerPkgs { config = "riscv64-unknown-linux-gnu"; };
+ };
+ pkgs = import nixpkgs { };
+in
+pkgs.runCommand "lld-speed-test" { } ''
+ extract_reproducer() {
+ ${pkgs.coreutils}/bin/mkdir -p $out/$2
+ ${pkgs.llvm}/bin/llvm-objcopy -O binary --only-section=.lld_repro --set-section-flags .lld_repro=alloc $1 - | ${pkgs.gnutar}/bin/tar x -I ${pkgs.lib.getBin pkgs.buildPackages.lz4}/bin/lz4 --strip-components=1 -C $out/$2
+ }
+
+ extract_reproducer ${targets.aarch64.hello}/bin/hello hello-arm64
+ extract_reproducer ${targets.x86_64.hello}/bin/hello hello-x64
+ extract_reproducer ${targets.aarch64.chrome}/libexec/chromium/chromium chrome
+ extract_reproducer ${targets.aarch64.ladybird}/lib/liblagom-web.so ladybird
+ extract_reproducer ${targets.aarch64.firefox}/lib/firefox/libxul.so firefox-arm64
+ extract_reproducer ${targets.x86_64.firefox}/lib/firefox/libxul.so firefox-x64
+ extract_reproducer ${targets.riscv64.firefox}/lib/firefox/libxul.so firefox-riscv64
+ extract_reproducer ${pkgs.lib.getLib targets.aarch64.llvm}/lib/libLLVM.so llvm-arm64
+ extract_reproducer ${pkgs.lib.getLib targets.x86_64.llvm}/lib/libLLVM.so llvm-x64
+  extract_reproducer ${pkgs.lib.getLib targets.riscv64.llvm}/lib/libLLVM.so llvm-riscv64
+''
diff --git a/lld/utils/speed-test-reproducers/ld-wrapper.sh b/lld/utils/speed-test-reproducers/ld-wrapper.sh
new file mode 100755
index 0000000..8a19d7e2
--- /dev/null
+++ b/lld/utils/speed-test-reproducers/ld-wrapper.sh
@@ -0,0 +1,50 @@
+#! @shell@
+
+source @out@/nix-support/utils.bash
+
+expandResponseParams "$@"
+
+output="a.out"
+should_add_repro=true
+newparams=()
+for arg in "${params[@]}"; do
+ case "$arg" in
+ -r|--version)
+ should_add_repro=false
+ ;;
+ *)
+ ;;
+ esac
+ case "$prev" in
+ -o)
+ output="$arg"
+ newparams+=("$arg")
+ ;;
+ *)
+ if [ -e "$arg.nolldrepro" ]; then
+ newparams+=("$arg.nolldrepro")
+ else
+ newparams+=("$arg")
+ fi
+ ;;
+ esac
+ prev="$arg"
+done
+
+export LLD_REPRODUCE="$output.repro.tar"
+if @targetPrefix@nix-wrap-lld "${newparams[@]}"; then
+ if $should_add_repro; then
+ @lz4@ -c "$LLD_REPRODUCE" > "$LLD_REPRODUCE.lz4"
+ mv "$output" "$output.nolldrepro"
+ @targetPrefix@objcopy --add-section ".lld_repro=$LLD_REPRODUCE.lz4" "$output.nolldrepro" "$output"
+ rm -f "$LLD_REPRODUCE.lz4"
+ fi
+ exitcode=0
+else
+ # Some Nix packages don't link with lld so just use bfd instead.
+ @targetPrefix@ld.bfd "${newparams[@]}"
+ exitcode=$?
+fi
+
+rm -f "$LLD_REPRODUCE"
+exit $exitcode
diff --git a/lldb/bindings/python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig
index 4721dfd..3ea24f1 100644
--- a/lldb/bindings/python/python-swigsafecast.swig
+++ b/lldb/bindings/python/python-swigsafecast.swig
@@ -142,5 +142,9 @@ PythonObject SWIGBridge::ToSWIGWrapper(
return ToSWIGHelper(module_spec_sb.release(), SWIGTYPE_p_lldb__SBModuleSpec);
}
+PythonObject SWIGBridge::ToSWIGWrapper(lldb::DescriptionLevel level) {
+ return PythonInteger((int64_t) level);
+}
+
} // namespace python
} // namespace lldb_private
diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig
index 2c30d53..64b7dc8 100644
--- a/lldb/bindings/python/python-wrapper.swig
+++ b/lldb/bindings/python/python-wrapper.swig
@@ -422,6 +422,30 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBBreakpoint(PyObject *
return sb_ptr;
}
+void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrame(PyObject * data) {
+ lldb::SBFrame *sb_ptr = nullptr;
+
+ int valid_cast =
+ SWIG_ConvertPtr(data, (void **)&sb_ptr, SWIGTYPE_p_lldb__SBFrame, 0);
+
+ if (valid_cast == -1)
+ return NULL;
+
+ return sb_ptr;
+}
+
+void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBBreakpointLocation(PyObject * data) {
+ lldb::SBBreakpointLocation *sb_ptr = nullptr;
+
+ int valid_cast =
+ SWIG_ConvertPtr(data, (void **)&sb_ptr, SWIGTYPE_p_lldb__SBBreakpointLocation, 0);
+
+ if (valid_cast == -1)
+ return NULL;
+
+ return sb_ptr;
+}
+
void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBAttachInfo(PyObject * data) {
lldb::SBAttachInfo *sb_ptr = nullptr;
diff --git a/lldb/cmake/modules/AddLLDB.cmake b/lldb/cmake/modules/AddLLDB.cmake
index 28bf8d8..bdd6f73 100644
--- a/lldb/cmake/modules/AddLLDB.cmake
+++ b/lldb/cmake/modules/AddLLDB.cmake
@@ -167,6 +167,12 @@ function(add_lldb_executable name)
)
target_link_libraries(${name} PRIVATE ${ARG_LINK_LIBS})
+ if(WIN32)
+ list(FIND ARG_LINK_LIBS liblldb LIBLLDB_INDEX)
+ if(NOT LIBLLDB_INDEX EQUAL -1)
+ target_link_options(${name} PRIVATE "/DELAYLOAD:$<TARGET_FILE_BASE_NAME:liblldb>.dll")
+ endif()
+ endif()
if(CLANG_LINK_CLANG_DYLIB)
target_link_libraries(${name} PRIVATE clang-cpp)
else()
diff --git a/lldb/docs/use/python-reference.rst b/lldb/docs/use/python-reference.rst
index 6ac2ec9..afca075 100644
--- a/lldb/docs/use/python-reference.rst
+++ b/lldb/docs/use/python-reference.rst
@@ -27,4 +27,4 @@ The following tutorials and documentation demonstrate various Python capabilitie
tutorials/writing-custom-commands
tutorials/implementing-standalone-scripts
tutorials/custom-frame-recognizers
- tutorials/extending-target-stop-hooks \ No newline at end of file
+ tutorials/extending-target-stop-hooks
diff --git a/lldb/docs/use/tutorials/creating-custom-breakpoints.md b/lldb/docs/use/tutorials/creating-custom-breakpoints.md
index e3081c4..04673c3 100644
--- a/lldb/docs/use/tutorials/creating-custom-breakpoints.md
+++ b/lldb/docs/use/tutorials/creating-custom-breakpoints.md
@@ -125,4 +125,48 @@ you can use for this purpose. Your __init__ function gets passed this
SBStructuredData object. This API also allows you to directly provide the list
of Modules and the list of CompileUnits that will make up the SearchFilter. If
you pass in empty lists, the breakpoint will use the default "search
-everywhere,accept everything" filter. \ No newline at end of file
+everywhere, accept everything" filter.
+
+### Providing Facade Locations:
+
+The breakpoint resolver interface also allows you to present a separate set
+of locations for the breakpoint than the ones that actually implement the
+breakpoint in the target.
+
+An example use case for this is if you are providing a debugging interface for a
+library that implements an interpreter for a language lldb can't debug. But
+while debugging that library at the level of the implementation language (e.g. C/C++, etc)
+you would like to offer the ability to "stop when a line in a source language
+file is executed".
+
+You can do this if you know where new lines of code are dispatched in the
+interpreter. You would set a breakpoint there, and then look at the state
+when that breakpoint is hit to see if it is dispatching the source file and
+line that were requested, and stop appropriately.
+
+Facade breakpoint locations are intended to make a more natural presentation
+of that sort of feature. The idea is that you would make a custom breakpoint
+resolver that sets actual locations in the places of interest in the interpreter.
+
+Then your resolver would add "facade locations" that represent the places in the
+interpreted code that you want the breakpoint to stop at, using SBBreakpoint::AddFacadeLocation.
+When lldb describes the breakpoint, it will only show the Facade locations.
+Since a facade breakpoint location's description is customizable, you can make these
+locations more descriptive. And when the "real" location is hit, lldb will call the
+"was_hit" method of your resolver. That will return the facade location you
+consider to have been hit this time around, or if you return None, the breakpoint
+will be considered not to have been hit.
+
+Note, this feature is also useful if you don't intend to present facade
+locations since it essentially provides a scripted breakpoint condition. Every
+time one of the locations in your breakpoint is hit, you can run the code in
+your "was_hit" to determine whether to consider the breakpoint hit or not, and
+return the location you were passed in if you want it to be a hit, and None if not.
+
+Facade locations add these optional affordances to the Resolver class:
+
+| Name | Arguments | Description|
+|-------|-----------|------------|
+|`was_hit`| `frame`:`lldb.SBFrame` `bp_loc`:`lldb.SBBreakpointLocation` | This will get called when one of the "real" locations set by your resolver is hit. `frame` is the stack frame that hit this location. `bp_loc` is the real location that was hit. Return either the facade location that you want to consider hit on this stop, or None if you don't consider any of your facade locations to have been hit. |
+| `get_location_description` | `bp_loc`:`lldb.SBBreakpointLocation` `desc_level`:`lldb.DescriptionLevel` | `bp_loc` is the facade location to describe. Use this to provide a helpful description of each facade location. `desc_level` is the level of description requested. The Brief description is printed when the location is hit. Full is printed for `break list` and Verbose for `break list -v`.|
+
diff --git a/lldb/include/lldb/API/SBBreakpoint.h b/lldb/include/lldb/API/SBBreakpoint.h
index 18ed3e7..fe19ba9 100644
--- a/lldb/include/lldb/API/SBBreakpoint.h
+++ b/lldb/include/lldb/API/SBBreakpoint.h
@@ -153,9 +153,15 @@ public:
/// fails, e.g. when there aren't enough hardware resources available.
lldb::SBError SetIsHardware(bool is_hardware);
- // Can only be called from a ScriptedBreakpointResolver...
+ /// Adds a location to the breakpoint at the address passed in.
+ /// Can only be called from a ScriptedBreakpointResolver...
SBError
AddLocation(SBAddress &address);
+ /// Add a "Facade location" to the breakpoint. This returns the Facade
+ /// Location that was added, which you can then use in
+ /// get_location_description and was_hit in your breakpoint resolver.
+ /// Can only be called from a ScriptedBreakpointResolver.
+ SBBreakpointLocation AddFacadeLocation();
SBStructuredData SerializeToStructuredData();
diff --git a/lldb/include/lldb/API/SBBreakpointLocation.h b/lldb/include/lldb/API/SBBreakpointLocation.h
index fa823e2..9b0d483 100644
--- a/lldb/include/lldb/API/SBBreakpointLocation.h
+++ b/lldb/include/lldb/API/SBBreakpointLocation.h
@@ -24,6 +24,8 @@ class SWIGBridge;
namespace lldb {
class LLDB_API SBBreakpointLocation {
+ friend class lldb_private::ScriptInterpreter;
+
public:
SBBreakpointLocation();
diff --git a/lldb/include/lldb/API/SBFrame.h b/lldb/include/lldb/API/SBFrame.h
index e4bbcd5..92917e5 100644
--- a/lldb/include/lldb/API/SBFrame.h
+++ b/lldb/include/lldb/API/SBFrame.h
@@ -226,6 +226,7 @@ protected:
friend class SBThread;
friend class SBValue;
+ friend class lldb_private::ScriptInterpreter;
friend class lldb_private::python::SWIGBridge;
friend class lldb_private::lua::SWIGBridge;
diff --git a/lldb/include/lldb/Breakpoint/Breakpoint.h b/lldb/include/lldb/Breakpoint/Breakpoint.h
index 26a5e90..c0bad98 100644
--- a/lldb/include/lldb/Breakpoint/Breakpoint.h
+++ b/lldb/include/lldb/Breakpoint/Breakpoint.h
@@ -248,6 +248,23 @@ public:
/// Returns a pointer to the new location.
lldb::BreakpointLocationSP AddLocation(const Address &addr,
bool *new_location = nullptr);
+ /// Add a `facade` location to the breakpoint's collection of facade
+ /// locations. This is only meant to be called by the breakpoint's resolver.
+ /// Facade locations are placeholders that a scripted breakpoint can use to
+ /// represent the stop locations provided by the breakpoint. The scripted
+ /// breakpoint should record the id of the facade location, and provide
+ /// the description of the location in the GetDescription method.
+ /// To emulate hitting a facade location, the resolver's WasHit should
+ /// return the facade location that was "hit".
+ ///
+ /// Facade locations take no address; each call creates a new facade
+ /// location with the next available facade location ID.
+ ///
+ /// \return
+ /// Returns a pointer to the new location.
+ lldb::BreakpointLocationSP AddFacadeLocation();
+
+ lldb::BreakpointLocationSP GetFacadeLocationByID(lldb::break_id_t);
/// Find a breakpoint location by Address.
///
@@ -268,27 +285,38 @@ public:
/// there is no breakpoint location at that address.
lldb::break_id_t FindLocationIDByAddress(const Address &addr);
- /// Find a breakpoint location for a given breakpoint location ID.
+ /// Find a breakpoint location for a given breakpoint location ID. If there
+ /// are Facade Locations in the breakpoint, the facade locations will be
+ /// searched instead of the "real" ones.
///
/// \param[in] bp_loc_id
/// The ID specifying the location.
+ ///
+ /// \param[in] use_facade
+ /// If \b true, then prefer facade locations over "real" ones if they exist.
+ ///
/// \return
/// Returns a shared pointer to the location with ID \a bp_loc_id. The
/// pointer
/// in the shared pointer will be nullptr if there is no location with that
/// ID.
- lldb::BreakpointLocationSP FindLocationByID(lldb::break_id_t bp_loc_id);
+ lldb::BreakpointLocationSP FindLocationByID(lldb::break_id_t bp_loc_id,
+ bool use_facade = true);
/// Get breakpoint locations by index.
///
/// \param[in] index
/// The location index.
///
+ /// \param[in] use_facade
+ /// If \b true, then prefer facade locations over "real" ones if they exist.
+ ///
/// \return
/// Returns a shared pointer to the location with index \a
/// index. The shared pointer might contain nullptr if \a index is
/// greater than then number of actual locations.
- lldb::BreakpointLocationSP GetLocationAtIndex(size_t index);
+ lldb::BreakpointLocationSP GetLocationAtIndex(size_t index,
+ bool use_facade = true);
/// Removes all invalid breakpoint locations.
///
@@ -409,9 +437,12 @@ public:
/// Return the number of breakpoint locations that have resolved to actual
/// breakpoint sites.
///
+ /// \param[in] use_facade
+ /// If \b true, then prefer facade locations over "real" ones if they exist.
+ ///
/// \return
/// The number locations resolved breakpoint sites.
- size_t GetNumResolvedLocations() const;
+ size_t GetNumResolvedLocations(bool use_facade = true) const;
/// Return whether this breakpoint has any resolved locations.
///
@@ -421,9 +452,12 @@ public:
/// Return the number of breakpoint locations.
///
+ /// \param[in] use_facade
+ /// If \b true, then prefer facade locations over "real" ones if they exist.
+ ///
/// \return
/// The number breakpoint locations.
- size_t GetNumLocations() const;
+ size_t GetNumLocations(bool use_facade = true) const;
/// Put a description of this breakpoint into the stream \a s.
///
@@ -529,6 +563,19 @@ private:
m_name_list.erase(name_to_remove);
}
+ /// This controls whether to display information about
+ /// the facade locations or the real locations.
+ enum DisplayType {
+ eDisplayFacade = 1, // Display facade locations
+ eDisplayReal = 1 << 1, // Display real locations
+ eDisplayHeader = 1 << 2 // Display compressed list of locations only
+ };
+
+ void GetDescriptionForType(Stream *s, lldb::DescriptionLevel level,
+ uint8_t display_type, bool show_locations);
+
+ bool HasFacadeLocations() { return m_facade_locations.GetSize() != 0; }
+
public:
bool MatchesName(const char *name) {
return m_name_list.find(name) != m_name_list.end();
@@ -657,6 +704,8 @@ private:
BreakpointOptions m_options; // Settable breakpoint options
BreakpointLocationList
m_locations; // The list of locations currently found for this breakpoint.
+ BreakpointLocationCollection m_facade_locations;
+
std::string m_kind_description;
bool m_resolve_indirect_symbols;
diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocation.h b/lldb/include/lldb/Breakpoint/BreakpointLocation.h
index ab2e5e1..c44ff47 100644
--- a/lldb/include/lldb/Breakpoint/BreakpointLocation.h
+++ b/lldb/include/lldb/Breakpoint/BreakpointLocation.h
@@ -38,6 +38,12 @@ namespace lldb_private {
class BreakpointLocation
: public std::enable_shared_from_this<BreakpointLocation> {
+ friend class BreakpointSite;
+ friend class BreakpointLocationList;
+ friend class Breakpoint;
+ friend class Process;
+ friend class StopInfoBreakpoint;
+
public:
~BreakpointLocation();
@@ -55,16 +61,39 @@ public:
Target &GetTarget();
+ /// This is a programmatic version of a breakpoint "condition". When a
+ /// breakpoint is hit, WasHit will get called before the synchronous
+ /// ShouldStop callback is run, and if it returns an empty
+ /// BreakpointLocationSP, lldb will act as if that breakpoint wasn't hit.
+ ///
+ /// \param[in] context
+ /// The context at the stop point
+ ///
+ /// \return
+ /// This will return the breakpoint location that was hit on this stop.
+ /// If there was no facade location this will be the original location.
+ /// If the shared pointer is empty, then we'll treat it as if the
+ /// breakpoint was not hit.
+ lldb::BreakpointLocationSP WasHit(StoppointCallbackContext *context);
+
/// Determines whether we should stop due to a hit at this breakpoint
/// location.
///
/// Side Effects: This may evaluate the breakpoint condition, and run the
/// callback. So this command may do a considerable amount of work.
///
+ /// \param[in] context
+ /// The context at the stop point
+ ///
+ /// \param[out] facade_loc_sp
+ /// If this stop should be attributed not to the location that was hit, but
+ /// to a facade location, it will be returned in this facade_loc_sp.
+ ///
/// \return
/// \b true if this breakpoint location thinks we should stop,
/// \b false otherwise.
- bool ShouldStop(StoppointCallbackContext *context);
+ bool ShouldStop(StoppointCallbackContext *context,
+ lldb::BreakpointLocationSP &facade_loc_sp);
// The next section deals with various breakpoint options.
@@ -292,11 +321,6 @@ public:
}
protected:
- friend class BreakpointSite;
- friend class BreakpointLocationList;
- friend class Process;
- friend class StopInfoBreakpoint;
-
/// Set the breakpoint site for this location to \a bp_site_sp.
///
/// \param[in] bp_site_sp
@@ -346,9 +370,11 @@ private:
// Constructors and Destructors
//
// Only the Breakpoint can make breakpoint locations, and it owns them.
-
/// Constructor.
///
+ /// \param[in] loc_id
+ /// The location id of the new location.
+ ///
/// \param[in] owner
/// A back pointer to the breakpoint that owns this location.
///
@@ -359,37 +385,65 @@ private:
/// The thread for which this breakpoint location is valid, or
/// LLDB_INVALID_THREAD_ID if it is valid for all threads.
///
- BreakpointLocation(lldb::break_id_t bid, Breakpoint &owner,
+ BreakpointLocation(lldb::break_id_t loc_id, Breakpoint &owner,
const Address &addr, lldb::tid_t tid,
bool check_for_resolver = true);
+ /// This is the constructor for locations with no address. Currently this is
+ /// just used for Facade locations.
+ ///
+ /// \param[in] loc_id
+ /// The location id of the new location.
+ ///
+ /// \param[in] owner
+ /// A back pointer to the breakpoint that owns this location.
+ ///
+ ///
+public:
+ BreakpointLocation(lldb::break_id_t loc_id, Breakpoint &owner);
+ bool IsValid() const { return m_is_valid; }
+ bool IsFacade() const { return m_is_facade; }
+
+private:
// Data members:
bool m_should_resolve_indirect_functions;
bool m_is_reexported;
bool m_is_indirect;
- Address m_address; ///< The address defining this location.
- Breakpoint &m_owner; ///< The breakpoint that produced this object.
- std::unique_ptr<BreakpointOptions> m_options_up; ///< Breakpoint options
- /// pointer, nullptr if we're
- /// using our breakpoint's
- /// options.
- lldb::BreakpointSiteSP m_bp_site_sp; ///< Our breakpoint site (it may be
- ///shared by more than one location.)
- lldb::UserExpressionSP m_user_expression_sp; ///< The compiled expression to
- ///use in testing our condition.
- std::mutex m_condition_mutex; ///< Guards parsing and evaluation of the
- ///condition, which could be evaluated by
- /// multiple processes.
- size_t m_condition_hash; ///< For testing whether the condition source code
- ///changed.
- lldb::break_id_t m_loc_id; ///< Breakpoint location ID.
- StoppointHitCounter m_hit_counter; ///< Number of times this breakpoint
- /// location has been hit.
+ ///< The address defining this location.
+ Address m_address;
+ ///< The breakpoint that produced this object.
+ Breakpoint &m_owner;
+ ///< Breakpoint options pointer, nullptr if we're using our breakpoint's
+ /// options.
+ std::unique_ptr<BreakpointOptions> m_options_up;
+ ///< Our breakpoint site (it may be shared by more than one location.)
+ lldb::BreakpointSiteSP m_bp_site_sp;
+ ///< The compiled expression to use in testing our condition.
+ lldb::UserExpressionSP m_user_expression_sp;
+ ///< Guards parsing and evaluation of the condition, which could be evaluated
+ /// by multiple processes.
+ std::mutex m_condition_mutex;
+ ///< For testing whether the condition source code changed.
+ size_t m_condition_hash;
+ ///< Breakpoint location ID.
+ lldb::break_id_t m_loc_id;
+ ///< Number of times this breakpoint location has been hit.
+ StoppointHitCounter m_hit_counter;
/// If this exists, use it to print the stop description rather than the
/// LineEntry m_address resolves to directly. Use this for instance when the
/// location was given somewhere in the virtual inlined call stack since the
/// Address always resolves to the lowest entry in the stack.
std::optional<LineEntry> m_preferred_line_entry;
+ /// Because Facade locations don't have sites we can't use the presence of
+ /// the site to mean this breakpoint is valid, but must manage the state
+ /// directly.
+ bool m_is_valid = true;
+ /// Facade locations aren't directly triggered and don't have a breakpoint
+ /// site. They are a useful fiction when you want to represent the stop
+ /// location as something lldb can't naturally stop at.
+ bool m_is_facade = false;
+
+ void SetInvalid() { m_is_valid = false; }
void SetShouldResolveIndirectFunctions(bool do_resolve) {
m_should_resolve_indirect_functions = do_resolve;
diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h
index 3aef1d6..1df4e0746 100644
--- a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h
+++ b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h
@@ -111,7 +111,8 @@ public:
///
/// \return
/// \b true if we should stop, \b false otherwise.
- bool ShouldStop(StoppointCallbackContext *context);
+ bool ShouldStop(StoppointCallbackContext *context,
+ BreakpointLocationCollection &stopped_bp_locs);
/// Print a description of the breakpoint locations in this list
/// to the stream \a s.
diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocationList.h b/lldb/include/lldb/Breakpoint/BreakpointLocationList.h
index 17dc0bf..952db55 100644
--- a/lldb/include/lldb/Breakpoint/BreakpointLocationList.h
+++ b/lldb/include/lldb/Breakpoint/BreakpointLocationList.h
@@ -140,7 +140,8 @@ public:
///
/// \return
/// \b true if we should stop, \b false otherwise.
- bool ShouldStop(StoppointCallbackContext *context, lldb::break_id_t breakID);
+ bool ShouldStop(StoppointCallbackContext *context, lldb::break_id_t breakID,
+ lldb::BreakpointLocationSP &bp_loc_sp);
/// Returns the number of elements in this breakpoint location list.
///
diff --git a/lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h b/lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h
index 0322fd9..c3c1c80 100644
--- a/lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h
+++ b/lldb/include/lldb/Breakpoint/BreakpointResolverScripted.h
@@ -45,6 +45,13 @@ public:
void GetDescription(Stream *s) override;
+ lldb::BreakpointLocationSP WasHit(lldb::StackFrameSP frame_sp,
+ lldb::BreakpointLocationSP bp_loc_sp);
+
+ std::optional<std::string>
+ GetLocationDescription(lldb::BreakpointLocationSP bp_loc_sp,
+ lldb::DescriptionLevel level);
+
void Dump(Stream *s) const override;
/// Methods for support type inquiry through isa, cast, and dyn_cast:
diff --git a/lldb/include/lldb/Breakpoint/BreakpointSite.h b/lldb/include/lldb/Breakpoint/BreakpointSite.h
index 7b3f7be..a935b24 100644
--- a/lldb/include/lldb/Breakpoint/BreakpointSite.h
+++ b/lldb/include/lldb/Breakpoint/BreakpointSite.h
@@ -99,7 +99,8 @@ public:
///
/// \return
/// \b true if we should stop, \b false otherwise.
- bool ShouldStop(StoppointCallbackContext *context) override;
+ bool ShouldStop(StoppointCallbackContext *context,
+ BreakpointLocationCollection &stopping_bp_loc) override;
/// Standard Dump method
void Dump(Stream *s) const override;
diff --git a/lldb/include/lldb/Breakpoint/StopPointSiteList.h b/lldb/include/lldb/Breakpoint/StopPointSiteList.h
index 7ed53e9..101eccd 100644
--- a/lldb/include/lldb/Breakpoint/StopPointSiteList.h
+++ b/lldb/include/lldb/Breakpoint/StopPointSiteList.h
@@ -213,30 +213,6 @@ public:
typedef void (*StopPointSiteSPMapFunc)(StopPointSite &site, void *baton);
- /// Enquires of the site on in this list with ID \a site_id
- /// whether we should stop for the constituent or not.
- ///
- /// \param[in] context
- /// This contains the information about this stop.
- ///
- /// \param[in] site_id
- /// This site ID that we hit.
- ///
- /// \return
- /// \b true if we should stop, \b false otherwise.
- bool ShouldStop(StoppointCallbackContext *context,
- typename StopPointSite::SiteID site_id) {
- if (StopPointSiteSP site_sp = FindByID(site_id)) {
- // Let the site decide if it should stop here (could not have
- // reached it's target hit count yet, or it could have a callback that
- // decided it shouldn't stop (shared library loads/unloads).
- return site_sp->ShouldStop(context);
- }
- // We should stop here since this site isn't valid anymore or it
- // doesn't exist.
- return true;
- }
-
/// Returns the number of elements in the list.
///
/// \result
diff --git a/lldb/include/lldb/Breakpoint/StoppointSite.h b/lldb/include/lldb/Breakpoint/StoppointSite.h
index bef19f3..2ceac40 100644
--- a/lldb/include/lldb/Breakpoint/StoppointSite.h
+++ b/lldb/include/lldb/Breakpoint/StoppointSite.h
@@ -38,7 +38,12 @@ public:
virtual bool IsHardware() const = 0;
- virtual bool ShouldStop(StoppointCallbackContext* context) = 0;
+ virtual bool ShouldStop(StoppointCallbackContext *context) { return false; };
+
+ virtual bool ShouldStop(StoppointCallbackContext *context,
+ BreakpointLocationCollection &stopping_bp_locs) {
+ return false;
+ };
virtual void Dump(Stream* stream) const = 0;
diff --git a/lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h b/lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h
index 28d6ed9..d29fd81 100644
--- a/lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h
+++ b/lldb/include/lldb/Interpreter/Interfaces/ScriptedBreakpointInterface.h
@@ -26,6 +26,16 @@ public:
virtual bool ResolverCallback(SymbolContext sym_ctx) { return true; }
virtual lldb::SearchDepth GetDepth() { return lldb::eSearchDepthModule; }
virtual std::optional<std::string> GetShortHelp() { return nullptr; }
+ /// WasHit returns the breakpoint location SP for the location that was "hit".
+ virtual lldb::BreakpointLocationSP
+ WasHit(lldb::StackFrameSP frame_sp, lldb::BreakpointLocationSP bp_loc_sp) {
+ return LLDB_INVALID_BREAK_ID;
+ }
+ virtual std::optional<std::string>
+ GetLocationDescription(lldb::BreakpointLocationSP bp_loc_sp,
+ lldb::DescriptionLevel level) {
+ return {};
+ }
};
} // namespace lldb_private
diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h
index 024bbc9..6c0054a 100644
--- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h
+++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h
@@ -11,6 +11,7 @@
#include "lldb/API/SBAttachInfo.h"
#include "lldb/API/SBBreakpoint.h"
+#include "lldb/API/SBBreakpointLocation.h"
#include "lldb/API/SBData.h"
#include "lldb/API/SBError.h"
#include "lldb/API/SBEvent.h"
@@ -572,12 +573,17 @@ public:
lldb::StreamSP GetOpaqueTypeFromSBStream(const lldb::SBStream &stream) const;
+ lldb::StackFrameSP GetOpaqueTypeFromSBFrame(const lldb::SBFrame &frame) const;
+
SymbolContext
GetOpaqueTypeFromSBSymbolContext(const lldb::SBSymbolContext &sym_ctx) const;
lldb::BreakpointSP
GetOpaqueTypeFromSBBreakpoint(const lldb::SBBreakpoint &breakpoint) const;
+ lldb::BreakpointLocationSP GetOpaqueTypeFromSBBreakpointLocation(
+ const lldb::SBBreakpointLocation &break_loc) const;
+
lldb::ProcessAttachInfoSP
GetOpaqueTypeFromSBAttachInfo(const lldb::SBAttachInfo &attach_info) const;
diff --git a/lldb/source/API/SBBreakpoint.cpp b/lldb/source/API/SBBreakpoint.cpp
index 23dba46..04af7a3 100644
--- a/lldb/source/API/SBBreakpoint.cpp
+++ b/lldb/source/API/SBBreakpoint.cpp
@@ -574,6 +574,15 @@ SBError SBBreakpoint::AddLocation(SBAddress &address) {
return error;
}
+SBBreakpointLocation SBBreakpoint::AddFacadeLocation() {
+ BreakpointSP bkpt_sp = GetSP();
+ if (!bkpt_sp)
+ return {};
+
+ BreakpointLocationSP loc_sp = bkpt_sp->AddFacadeLocation();
+ return SBBreakpointLocation(loc_sp);
+}
+
SBStructuredData SBBreakpoint::SerializeToStructuredData() {
LLDB_INSTRUMENT_VA(this);
diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp
index 1544bf8..b23d114 100644
--- a/lldb/source/Breakpoint/Breakpoint.cpp
+++ b/lldb/source/Breakpoint/Breakpoint.cpp
@@ -58,7 +58,13 @@ Breakpoint::Breakpoint(Target &new_target, const Breakpoint &source_bp)
m_hit_counter() {}
// Destructor
-Breakpoint::~Breakpoint() = default;
+Breakpoint::~Breakpoint() {
+ for (BreakpointLocationSP location_sp : m_locations.BreakpointLocations())
+ location_sp->SetInvalid();
+ for (BreakpointLocationSP location_sp :
+ m_facade_locations.BreakpointLocations())
+ location_sp->SetInvalid();
+}
BreakpointSP Breakpoint::CopyFromBreakpoint(TargetSP new_target,
const Breakpoint &bp_to_copy_from) {
@@ -302,6 +308,20 @@ BreakpointLocationSP Breakpoint::AddLocation(const Address &addr,
new_location);
}
+BreakpointLocationSP Breakpoint::AddFacadeLocation() {
+ size_t next_id = m_facade_locations.GetSize() + 1;
+ BreakpointLocationSP break_loc_sp =
+ std::make_shared<BreakpointLocation>(next_id, *this);
+ break_loc_sp->m_is_facade = true;
+ m_facade_locations.Add(break_loc_sp);
+ return break_loc_sp;
+}
+
+BreakpointLocationSP
+Breakpoint::GetFacadeLocationByID(lldb::break_id_t loc_id) {
+ return m_facade_locations.GetByIndex(loc_id - 1);
+}
+
BreakpointLocationSP Breakpoint::FindLocationByAddress(const Address &addr) {
return m_locations.FindByAddress(addr);
}
@@ -310,15 +330,23 @@ break_id_t Breakpoint::FindLocationIDByAddress(const Address &addr) {
return m_locations.FindIDByAddress(addr);
}
-BreakpointLocationSP Breakpoint::FindLocationByID(break_id_t bp_loc_id) {
+BreakpointLocationSP Breakpoint::FindLocationByID(break_id_t bp_loc_id,
+ bool use_facade) {
+ if (use_facade && m_facade_locations.GetSize())
+ return GetFacadeLocationByID(bp_loc_id);
return m_locations.FindByID(bp_loc_id);
}
-BreakpointLocationSP Breakpoint::GetLocationAtIndex(size_t index) {
+BreakpointLocationSP Breakpoint::GetLocationAtIndex(size_t index,
+ bool use_facade) {
+ if (use_facade && m_facade_locations.GetSize() > 0)
+ return m_facade_locations.GetByIndex(index);
return m_locations.GetByIndex(index);
}
void Breakpoint::RemoveInvalidLocations(const ArchSpec &arch) {
+ // FIXME: Should we ask the scripted resolver whether any of its facade
+ // locations are invalid?
m_locations.RemoveInvalidLocations(arch);
}
@@ -864,9 +892,15 @@ void Breakpoint::ModuleReplaced(ModuleSP old_module_sp,
void Breakpoint::Dump(Stream *) {}
-size_t Breakpoint::GetNumResolvedLocations() const {
+size_t Breakpoint::GetNumResolvedLocations(bool use_facade) const {
// Return the number of breakpoints that are actually resolved and set down
// in the inferior process.
+ // All facade locations are considered to be resolved:
+ if (use_facade) {
+ size_t num_facade_locs = m_facade_locations.GetSize();
+ if (num_facade_locs)
+ return num_facade_locs;
+ }
return m_locations.GetNumResolvedLocations();
}
@@ -874,7 +908,14 @@ bool Breakpoint::HasResolvedLocations() const {
return GetNumResolvedLocations() > 0;
}
-size_t Breakpoint::GetNumLocations() const { return m_locations.GetSize(); }
+size_t Breakpoint::GetNumLocations(bool use_facade) const {
+ if (use_facade) {
+ size_t num_facade_locs = m_facade_locations.GetSize();
+ if (num_facade_locs > 0)
+ return num_facade_locs;
+ }
+ return m_locations.GetSize();
+}
void Breakpoint::AddName(llvm::StringRef new_name) {
m_name_list.insert(new_name.str());
@@ -899,8 +940,31 @@ void Breakpoint::GetDescription(Stream *s, lldb::DescriptionLevel level,
s->Printf("Kind: %s\n", GetBreakpointKind());
}
- const size_t num_locations = GetNumLocations();
- const size_t num_resolved_locations = GetNumResolvedLocations();
+ bool show_both_types = level == eDescriptionLevelVerbose &&
+ HasFacadeLocations() && show_locations;
+ uint8_t display_mask = eDisplayFacade;
+ if (show_both_types)
+ display_mask |= eDisplayHeader;
+
+ GetDescriptionForType(s, level, display_mask, show_locations);
+
+ if (show_both_types) {
+ display_mask = eDisplayReal | eDisplayHeader;
+ GetDescriptionForType(s, level, display_mask, show_locations);
+ }
+ // Reset the colors back to normal if they were previously greyed out.
+ if (dim_breakpoint_description)
+ s->Printf("%s", ansi::FormatAnsiTerminalCodes(
+ GetTarget().GetDebugger().GetDisabledAnsiSuffix())
+ .c_str());
+}
+
+void Breakpoint::GetDescriptionForType(Stream *s, lldb::DescriptionLevel level,
+ uint8_t display_type,
+ bool show_locations) {
+ bool use_facade = (display_type & eDisplayFacade) != 0;
+ const size_t num_locations = GetNumLocations(use_facade);
+ const size_t num_resolved_locations = GetNumResolvedLocations(use_facade);
// They just made the breakpoint, they don't need to be told HOW they made
// it... Also, we'll print the breakpoint number differently depending on
@@ -957,7 +1021,7 @@ void Breakpoint::GetDescription(Stream *s, lldb::DescriptionLevel level,
} else if (num_locations == 1 && !show_locations) {
// There is only one location, so we'll just print that location
// information.
- GetLocationAtIndex(0)->GetDescription(s, level);
+ GetLocationAtIndex(0, use_facade)->GetDescription(s, level);
} else {
s->Printf("%" PRIu64 " locations.", static_cast<uint64_t>(num_locations));
}
@@ -979,20 +1043,20 @@ void Breakpoint::GetDescription(Stream *s, lldb::DescriptionLevel level,
// The brief description is just the location name (1.2 or whatever). That's
// pointless to show in the breakpoint's description, so suppress it.
if (show_locations && level != lldb::eDescriptionLevelBrief) {
+ if ((display_type & eDisplayHeader) != 0) {
+ if ((display_type & eDisplayFacade) != 0)
+ s->Printf("Facade locations:\n");
+ else
+ s->Printf("Implementation Locations\n");
+ }
s->IndentMore();
for (size_t i = 0; i < num_locations; ++i) {
- BreakpointLocation *loc = GetLocationAtIndex(i).get();
+ BreakpointLocation *loc = GetLocationAtIndex(i, use_facade).get();
loc->GetDescription(s, level);
s->EOL();
}
s->IndentLess();
}
-
- // Reset the colors back to normal if they were previously greyed out.
- if (dim_breakpoint_description)
- s->Printf("%s", ansi::FormatAnsiTerminalCodes(
- GetTarget().GetDebugger().GetDisabledAnsiSuffix())
- .c_str());
}
void Breakpoint::GetResolverDescription(Stream *s) {
diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp
index 443d4f5..22c98ac 100644
--- a/lldb/source/Breakpoint/BreakpointLocation.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocation.cpp
@@ -8,6 +8,8 @@
#include "lldb/Breakpoint/BreakpointLocation.h"
#include "lldb/Breakpoint/BreakpointID.h"
+#include "lldb/Breakpoint/BreakpointResolver.h"
+#include "lldb/Breakpoint/BreakpointResolverScripted.h"
#include "lldb/Breakpoint/StoppointCallbackContext.h"
#include "lldb/Core/Debugger.h"
#include "lldb/Core/Module.h"
@@ -45,6 +47,13 @@ BreakpointLocation::BreakpointLocation(break_id_t loc_id, Breakpoint &owner,
SetThreadIDInternal(tid);
}
+BreakpointLocation::BreakpointLocation(break_id_t loc_id, Breakpoint &owner)
+ : m_should_resolve_indirect_functions(false), m_is_reexported(false),
+ m_is_indirect(false), m_address(LLDB_INVALID_ADDRESS), m_owner(owner),
+ m_condition_hash(0), m_loc_id(loc_id), m_hit_counter() {
+ SetThreadIDInternal(LLDB_INVALID_THREAD_ID);
+}
+
BreakpointLocation::~BreakpointLocation() {
llvm::consumeError(ClearBreakpointSite());
}
@@ -372,12 +381,43 @@ bool BreakpointLocation::ValidForThisThread(Thread &thread) {
.GetThreadSpecNoCreate());
}
+BreakpointLocationSP
+BreakpointLocation::WasHit(StoppointCallbackContext *context) {
+ // Only the BreakpointResolverScripted provides WasHit.
+ BreakpointResolverSP resolver_sp = GetBreakpoint().GetResolver();
+ BreakpointResolverScripted *scripted =
+ llvm::dyn_cast<BreakpointResolverScripted>(resolver_sp.get());
+ if (!scripted)
+ return shared_from_this();
+
+ StackFrameSP frame_sp = context->exe_ctx_ref.GetFrameSP();
+ if (!frame_sp)
+ return shared_from_this();
+
+ BreakpointLocationSP return_loc_sp =
+ scripted->WasHit(frame_sp, shared_from_this());
+ // If this is a facade location, then we won't have bumped its hit count
+ // while processing the original location hit. Do so here. We don't need
+ // to bump the breakpoint's hit count, however, since hitting the real
+ // location would have already done that.
+ // Also we have to check the enabled state here, since we would never have
+ // gotten here with a real location...
+ if (return_loc_sp && return_loc_sp->IsFacade()) {
+ if (return_loc_sp->IsEnabled())
+ return_loc_sp->m_hit_counter.Increment();
+ else
+ return {};
+ }
+ return return_loc_sp;
+}
+
// RETURNS - true if we should stop at this breakpoint, false if we
// should continue. Note, we don't check the thread spec for the breakpoint
// here, since if the breakpoint is not for this thread, then the event won't
// even get reported, so the check is redundant.
-bool BreakpointLocation::ShouldStop(StoppointCallbackContext *context) {
+bool BreakpointLocation::ShouldStop(StoppointCallbackContext *context,
+ lldb::BreakpointLocationSP &facade_loc_sp) {
bool should_stop = true;
Log *log = GetLog(LLDBLog::Breakpoints);
@@ -386,6 +426,27 @@ bool BreakpointLocation::ShouldStop(StoppointCallbackContext *context) {
if (!IsEnabled())
return false;
+ // Next check WasHit:
+ BreakpointLocationSP loc_hit_sp = WasHit(context);
+
+ if (!loc_hit_sp) {
+ // We bump the hit counts in StopInfoBreakpoint::ShouldStopSynchronous,
+ // before we call into each location's ShouldStop. So we need to undo
+ // that here.
+ UndoBumpHitCount();
+ return false;
+ }
+
+ // If the location hit was not us, it was a facade location, in which case
+ // we should use the facade location's callbacks, etc. Those will all be
+ // run in the asynchronous phase, so for now we just have to record the fact
+ // that we should treat this as a facade hit. This is strictly an out
+ // parameter, so clear it if this isn't a facade hit.
+ if (loc_hit_sp.get() != this)
+ facade_loc_sp = loc_hit_sp;
+ else
+ facade_loc_sp.reset();
+
// We only run synchronous callbacks in ShouldStop:
context->is_synchronous = true;
should_stop = InvokeCallback(context);
@@ -395,6 +456,11 @@ bool BreakpointLocation::ShouldStop(StoppointCallbackContext *context) {
GetDescription(&s, lldb::eDescriptionLevelVerbose);
LLDB_LOGF(log, "Hit breakpoint location: %s, %s.\n", s.GetData(),
should_stop ? "stopping" : "continuing");
+ if (facade_loc_sp) {
+ s.Clear();
+ facade_loc_sp->GetDescription(&s, lldb::eDescriptionLevelVerbose);
+ LLDB_LOGF(log, "Attributing to facade location: %s.\n", s.GetData());
+ }
}
return should_stop;
@@ -417,7 +483,10 @@ void BreakpointLocation::UndoBumpHitCount() {
}
bool BreakpointLocation::IsResolved() const {
- return m_bp_site_sp.get() != nullptr;
+
+ bool has_site = m_bp_site_sp.get() != nullptr;
+ // Facade locations are currently always considered resolved.
+ return has_site || IsFacade();
}
lldb::BreakpointSiteSP BreakpointLocation::GetBreakpointSite() const {
@@ -425,7 +494,9 @@ lldb::BreakpointSiteSP BreakpointLocation::GetBreakpointSite() const {
}
llvm::Error BreakpointLocation::ResolveBreakpointSite() {
- if (m_bp_site_sp)
+ // This might be a facade location, which doesn't have an address.
+ // In that case, don't attempt to make a site.
+ if (m_bp_site_sp || IsFacade())
return llvm::Error::success();
Process *process = m_owner.GetTarget().GetProcessSP().get();
@@ -454,8 +525,12 @@ bool BreakpointLocation::SetBreakpointSite(BreakpointSiteSP &bp_site_sp) {
}
llvm::Error BreakpointLocation::ClearBreakpointSite() {
- if (!m_bp_site_sp)
+ if (!m_bp_site_sp) {
+    // This might be a Facade Location, which doesn't have a site or address
+ if (IsFacade())
+ return llvm::Error::success();
return llvm::createStringError("no breakpoint site to clear");
+ }
// If the process exists, get it to remove the owner, it will remove the
// physical implementation of the breakpoint as well if there are no more
@@ -474,6 +549,17 @@ void BreakpointLocation::GetDescription(Stream *s,
lldb::DescriptionLevel level) {
SymbolContext sc;
+ // If this is a scripted breakpoint, give it a chance to describe its
+ // locations:
+ std::optional<std::string> scripted_opt;
+ BreakpointResolverSP resolver_sp = GetBreakpoint().GetResolver();
+ BreakpointResolverScripted *scripted =
+ llvm::dyn_cast<BreakpointResolverScripted>(resolver_sp.get());
+ if (scripted)
+ scripted_opt = scripted->GetLocationDescription(shared_from_this(), level);
+
+ bool is_scripted_desc = scripted_opt.has_value();
+
// If the description level is "initial" then the breakpoint is printing out
// our initial state, and we should let it decide how it wants to print our
// label.
@@ -491,7 +577,9 @@ void BreakpointLocation::GetDescription(Stream *s,
if (level == lldb::eDescriptionLevelVerbose)
s->IndentMore();
- if (m_address.IsSectionOffset()) {
+ if (is_scripted_desc) {
+ s->PutCString(scripted_opt->c_str());
+ } else if (m_address.IsSectionOffset()) {
m_address.CalculateSymbolContext(&sc);
if (level == lldb::eDescriptionLevelFull ||
@@ -566,43 +654,51 @@ void BreakpointLocation::GetDescription(Stream *s,
s->Indent();
}
- if (m_address.IsSectionOffset() &&
- (level == eDescriptionLevelFull || level == eDescriptionLevelInitial))
- s->Printf(", ");
- s->Printf("address = ");
-
- ExecutionContextScope *exe_scope = nullptr;
- Target *target = &m_owner.GetTarget();
- if (target)
- exe_scope = target->GetProcessSP().get();
- if (exe_scope == nullptr)
- exe_scope = target;
-
- if (level == eDescriptionLevelInitial)
- m_address.Dump(s, exe_scope, Address::DumpStyleLoadAddress,
- Address::DumpStyleFileAddress);
- else
- m_address.Dump(s, exe_scope, Address::DumpStyleLoadAddress,
- Address::DumpStyleModuleWithFileAddress);
-
- if (IsIndirect() && m_bp_site_sp) {
- Address resolved_address;
- resolved_address.SetLoadAddress(m_bp_site_sp->GetLoadAddress(), target);
- Symbol *resolved_symbol = resolved_address.CalculateSymbolContextSymbol();
- if (resolved_symbol) {
- if (level == eDescriptionLevelFull || level == eDescriptionLevelInitial)
- s->Printf(", ");
- else if (level == lldb::eDescriptionLevelVerbose) {
- s->EOL();
- s->Indent();
+ if (!is_scripted_desc) {
+ if (m_address.IsSectionOffset() &&
+ (level == eDescriptionLevelFull || level == eDescriptionLevelInitial))
+ s->Printf(", ");
+ s->Printf("address = ");
+
+ ExecutionContextScope *exe_scope = nullptr;
+ Target *target = &m_owner.GetTarget();
+ if (target)
+ exe_scope = target->GetProcessSP().get();
+ if (exe_scope == nullptr)
+ exe_scope = target;
+
+ if (level == eDescriptionLevelInitial)
+ m_address.Dump(s, exe_scope, Address::DumpStyleLoadAddress,
+ Address::DumpStyleFileAddress);
+ else
+ m_address.Dump(s, exe_scope, Address::DumpStyleLoadAddress,
+ Address::DumpStyleModuleWithFileAddress);
+
+ if (IsIndirect() && m_bp_site_sp) {
+ Address resolved_address;
+ resolved_address.SetLoadAddress(m_bp_site_sp->GetLoadAddress(), target);
+ Symbol *resolved_symbol = resolved_address.CalculateSymbolContextSymbol();
+ if (resolved_symbol) {
+ if (level == eDescriptionLevelFull || level == eDescriptionLevelInitial)
+ s->Printf(", ");
+ else if (level == lldb::eDescriptionLevelVerbose) {
+ s->EOL();
+ s->Indent();
+ }
+ s->Printf("indirect target = %s",
+ resolved_symbol->GetName().GetCString());
}
- s->Printf("indirect target = %s",
- resolved_symbol->GetName().GetCString());
}
}
- bool is_resolved = IsResolved();
- bool is_hardware = is_resolved && m_bp_site_sp->IsHardware();
+  // FIXME: scripted breakpoints are currently always resolved. Does this seem
+ // right? If they don't add any scripted locations, we shouldn't consider them
+ // resolved.
+ bool is_resolved = is_scripted_desc || IsResolved();
+ // A scripted breakpoint might be resolved but not have a site. Be sure to
+ // check for that.
+ bool is_hardware = !is_scripted_desc && IsResolved() && m_bp_site_sp &&
+ m_bp_site_sp->IsHardware();
if (level == lldb::eDescriptionLevelVerbose) {
s->EOL();
@@ -717,9 +813,9 @@ void BreakpointLocation::SwapLocation(BreakpointLocationSP swap_from) {
}
void BreakpointLocation::SetThreadIDInternal(lldb::tid_t thread_id) {
- if (thread_id != LLDB_INVALID_THREAD_ID)
+ if (thread_id != LLDB_INVALID_THREAD_ID) {
GetLocationOptions().SetThreadID(thread_id);
- else {
+ } else {
// If we're resetting this to an invalid thread id, then don't make an
// options pointer just to do that.
if (m_options_up != nullptr)
diff --git a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp
index 81bec0bd7..1d052c5 100644
--- a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp
@@ -115,7 +115,8 @@ BreakpointLocationCollection::GetByIndex(size_t i) const {
}
bool BreakpointLocationCollection::ShouldStop(
- StoppointCallbackContext *context) {
+ StoppointCallbackContext *context,
+ BreakpointLocationCollection &stopped_bp_locs) {
bool shouldStop = false;
size_t i = 0;
size_t prev_size = GetSize();
@@ -123,9 +124,20 @@ bool BreakpointLocationCollection::ShouldStop(
// ShouldStop can remove the breakpoint from the list, or even delete
// it, so we should
BreakpointLocationSP cur_loc_sp = GetByIndex(i);
+ BreakpointLocationSP reported_loc_sp;
BreakpointSP keep_bkpt_alive_sp = cur_loc_sp->GetBreakpoint().shared_from_this();
- if (cur_loc_sp->ShouldStop(context))
+    // We're building up the list of which locations claim responsibility for
+ // this stop. If the location's ShouldStop defers to a facade location by
+ // returning a non-null reported location, we want to use that. Otherwise
+ // use the original location.
+ if (cur_loc_sp->ShouldStop(context, reported_loc_sp)) {
+ if (reported_loc_sp)
+ stopped_bp_locs.Add(reported_loc_sp);
+ else
+ stopped_bp_locs.Add(cur_loc_sp);
+
shouldStop = true;
+ }
if (prev_size == GetSize())
i++;
diff --git a/lldb/source/Breakpoint/BreakpointLocationList.cpp b/lldb/source/Breakpoint/BreakpointLocationList.cpp
index 44d1eb5..ea431aa 100644
--- a/lldb/source/Breakpoint/BreakpointLocationList.cpp
+++ b/lldb/source/Breakpoint/BreakpointLocationList.cpp
@@ -41,13 +41,14 @@ BreakpointLocationList::Create(const Address &addr,
}
bool BreakpointLocationList::ShouldStop(StoppointCallbackContext *context,
- lldb::break_id_t break_id) {
+ lldb::break_id_t break_id,
+ lldb::BreakpointLocationSP &bp_loc_sp) {
BreakpointLocationSP bp = FindByID(break_id);
if (bp) {
// Let the BreakpointLocation decide if it should stop here (could not have
// reached it's target hit count yet, or it could have a callback that
// decided it shouldn't stop (shared library loads/unloads).
- return bp->ShouldStop(context);
+ return bp->ShouldStop(context, bp_loc_sp);
}
// We should stop here since this BreakpointLocation isn't valid anymore or
// it doesn't exist.
diff --git a/lldb/source/Breakpoint/BreakpointResolverScripted.cpp b/lldb/source/Breakpoint/BreakpointResolverScripted.cpp
index 701caba..373bd74 100644
--- a/lldb/source/Breakpoint/BreakpointResolverScripted.cpp
+++ b/lldb/source/Breakpoint/BreakpointResolverScripted.cpp
@@ -50,7 +50,9 @@ void BreakpointResolverScripted::CreateImplementationIfNeeded(
if (!script_interp)
return;
- m_interface_sp = script_interp->CreateScriptedBreakpointInterface();
+ if (!m_interface_sp)
+ m_interface_sp = script_interp->CreateScriptedBreakpointInterface();
+
if (!m_interface_sp) {
m_error = Status::FromErrorStringWithFormat(
"BreakpointResolverScripted::%s () - ERROR: %s", __FUNCTION__,
@@ -61,6 +63,7 @@ void BreakpointResolverScripted::CreateImplementationIfNeeded(
auto obj_or_err =
m_interface_sp->CreatePluginObject(m_class_name, breakpoint_sp, m_args);
if (!obj_or_err) {
+ m_interface_sp.reset();
m_error = Status::FromError(obj_or_err.takeError());
return;
}
@@ -146,6 +149,8 @@ void BreakpointResolverScripted::GetDescription(Stream *s) {
StructuredData::GenericSP generic_sp;
std::optional<std::string> short_help;
+ CreateImplementationIfNeeded(GetBreakpoint());
+
if (m_interface_sp) {
short_help = m_interface_sp->GetShortHelp();
}
@@ -155,6 +160,22 @@ void BreakpointResolverScripted::GetDescription(Stream *s) {
s->Printf("python class = %s", m_class_name.c_str());
}
+std::optional<std::string> BreakpointResolverScripted::GetLocationDescription(
+ lldb::BreakpointLocationSP bp_loc_sp, lldb::DescriptionLevel level) {
+ CreateImplementationIfNeeded(GetBreakpoint());
+ if (m_interface_sp)
+ return m_interface_sp->GetLocationDescription(bp_loc_sp, level);
+ return {};
+}
+
+lldb::BreakpointLocationSP
+BreakpointResolverScripted::WasHit(lldb::StackFrameSP frame_sp,
+ lldb::BreakpointLocationSP bp_loc_sp) {
+ if (m_interface_sp)
+ return m_interface_sp->WasHit(frame_sp, bp_loc_sp);
+ return {};
+}
+
void BreakpointResolverScripted::Dump(Stream *s) const {}
lldb::BreakpointResolverSP
diff --git a/lldb/source/Breakpoint/BreakpointSite.cpp b/lldb/source/Breakpoint/BreakpointSite.cpp
index d430e3d..fd7666b 100644
--- a/lldb/source/Breakpoint/BreakpointSite.cpp
+++ b/lldb/source/Breakpoint/BreakpointSite.cpp
@@ -45,7 +45,9 @@ break_id_t BreakpointSite::GetNextID() {
// RETURNS - true if we should stop at this breakpoint, false if we
// should continue.
-bool BreakpointSite::ShouldStop(StoppointCallbackContext *context) {
+bool BreakpointSite::ShouldStop(
+ StoppointCallbackContext *context,
+ BreakpointLocationCollection &stopping_bp_locs) {
m_hit_counter.Increment();
// ShouldStop can do a lot of work, and might even come back and hit
// this breakpoint site again. So don't hold the m_constituents_mutex the
@@ -56,7 +58,7 @@ bool BreakpointSite::ShouldStop(StoppointCallbackContext *context) {
std::lock_guard<std::recursive_mutex> guard(m_constituents_mutex);
constituents_copy = m_constituents;
}
- return constituents_copy.ShouldStop(context);
+ return constituents_copy.ShouldStop(context, stopping_bp_locs);
}
bool BreakpointSite::IsBreakpointAtThisSite(lldb::break_id_t bp_id) {
diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp
index 2ccebf3..c40612c 100644
--- a/lldb/source/Core/ModuleList.cpp
+++ b/lldb/source/Core/ModuleList.cpp
@@ -989,7 +989,7 @@ private:
};
struct SharedModuleListInfo {
- ModuleList module_list;
+ SharedModuleList module_list;
ModuleListProperties module_list_properties;
};
}
@@ -1007,7 +1007,7 @@ static SharedModuleListInfo &GetSharedModuleListInfo()
return *g_shared_module_list_info;
}
-static ModuleList &GetSharedModuleList() {
+static SharedModuleList &GetSharedModuleList() {
return GetSharedModuleListInfo().module_list;
}
@@ -1017,8 +1017,8 @@ ModuleListProperties &ModuleList::GetGlobalModuleListProperties() {
bool ModuleList::ModuleIsInCache(const Module *module_ptr) {
if (module_ptr) {
- ModuleList &shared_module_list = GetSharedModuleList();
- return shared_module_list.FindModule(module_ptr).get() != nullptr;
+ SharedModuleList &shared_module_list = GetSharedModuleList();
+ return shared_module_list.FindModule(*module_ptr).get() != nullptr;
}
return false;
}
@@ -1041,9 +1041,8 @@ ModuleList::GetSharedModule(const ModuleSpec &module_spec, ModuleSP &module_sp,
const FileSpecList *module_search_paths_ptr,
llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules,
bool *did_create_ptr, bool always_create) {
- ModuleList &shared_module_list = GetSharedModuleList();
- std::lock_guard<std::recursive_mutex> guard(
- shared_module_list.m_modules_mutex);
+ SharedModuleList &shared_module_list = GetSharedModuleList();
+ std::lock_guard<std::recursive_mutex> guard(shared_module_list.GetMutex());
char path[PATH_MAX];
Status error;
diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp
index 6a654a0..ca768db 100644
--- a/lldb/source/Interpreter/ScriptInterpreter.cpp
+++ b/lldb/source/Interpreter/ScriptInterpreter.cpp
@@ -81,6 +81,12 @@ lldb::BreakpointSP ScriptInterpreter::GetOpaqueTypeFromSBBreakpoint(
return breakpoint.m_opaque_wp.lock();
}
+lldb::BreakpointLocationSP
+ScriptInterpreter::GetOpaqueTypeFromSBBreakpointLocation(
+ const lldb::SBBreakpointLocation &break_loc) const {
+ return break_loc.m_opaque_wp.lock();
+}
+
lldb::ProcessAttachInfoSP ScriptInterpreter::GetOpaqueTypeFromSBAttachInfo(
const lldb::SBAttachInfo &attach_info) const {
return attach_info.m_opaque_sp;
@@ -100,6 +106,13 @@ ScriptInterpreter::GetStatusFromSBError(const lldb::SBError &error) const {
return Status();
}
+lldb::StackFrameSP
+ScriptInterpreter::GetOpaqueTypeFromSBFrame(const lldb::SBFrame &frame) const {
+ if (frame.m_opaque_sp)
+ return frame.m_opaque_sp->GetFrameSP();
+ return nullptr;
+}
+
Event *
ScriptInterpreter::GetOpaqueTypeFromSBEvent(const lldb::SBEvent &event) const {
return event.m_opaque_ptr;
diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp
index 600cc0a..f538fc6 100644
--- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp
+++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp
@@ -477,6 +477,249 @@ std::string PlatformAndroid::GetRunAs() {
}
return run_as.str();
}
+
+// Helper function to populate process status information from
+// /proc/[pid]/status
+void PlatformAndroid::PopulateProcessStatusInfo(
+ lldb::pid_t pid, ProcessInstanceInfo &process_info) {
+ // Read /proc/[pid]/status to get parent PID, UIDs, and GIDs
+ Status error;
+ AdbClientUP status_adb = GetAdbClient(error);
+ if (error.Fail())
+ return;
+
+ std::string status_output;
+ StreamString status_cmd;
+ status_cmd.Printf(
+ "cat /proc/%llu/status 2>/dev/null | grep -E '^(PPid|Uid|Gid):'",
+ static_cast<unsigned long long>(pid));
+ Status status_error =
+ status_adb->Shell(status_cmd.GetData(), seconds(5), &status_output);
+
+ if (status_error.Fail() || status_output.empty())
+ return;
+
+ llvm::SmallVector<llvm::StringRef, 16> lines;
+ llvm::StringRef(status_output).split(lines, '\n');
+
+ for (llvm::StringRef line : lines) {
+ line = line.trim();
+ if (line.starts_with("PPid:")) {
+ llvm::StringRef ppid_str = line.substr(5).trim();
+ lldb::pid_t ppid;
+ if (llvm::to_integer(ppid_str, ppid))
+ process_info.SetParentProcessID(ppid);
+ } else if (line.starts_with("Uid:")) {
+ llvm::SmallVector<llvm::StringRef, 4> uid_parts;
+ line.substr(4).trim().split(uid_parts, '\t', -1, false);
+ if (uid_parts.size() >= 2) {
+ uint32_t uid, euid;
+ if (llvm::to_integer(uid_parts[0].trim(), uid))
+ process_info.SetUserID(uid);
+ if (llvm::to_integer(uid_parts[1].trim(), euid))
+ process_info.SetEffectiveUserID(euid);
+ }
+ } else if (line.starts_with("Gid:")) {
+ llvm::SmallVector<llvm::StringRef, 4> gid_parts;
+ line.substr(4).trim().split(gid_parts, '\t', -1, false);
+ if (gid_parts.size() >= 2) {
+ uint32_t gid, egid;
+ if (llvm::to_integer(gid_parts[0].trim(), gid))
+ process_info.SetGroupID(gid);
+ if (llvm::to_integer(gid_parts[1].trim(), egid))
+ process_info.SetEffectiveGroupID(egid);
+ }
+ }
+ }
+}
+
+// Helper function to populate command line arguments from /proc/[pid]/cmdline
+void PlatformAndroid::PopulateProcessCommandLine(
+ lldb::pid_t pid, ProcessInstanceInfo &process_info) {
+ // Read /proc/[pid]/cmdline to get command line arguments
+ Status error;
+ AdbClientUP cmdline_adb = GetAdbClient(error);
+ if (error.Fail())
+ return;
+
+ std::string cmdline_output;
+ StreamString cmdline_cmd;
+ cmdline_cmd.Printf("cat /proc/%llu/cmdline 2>/dev/null | tr '\\000' ' '",
+ static_cast<unsigned long long>(pid));
+ Status cmdline_error =
+ cmdline_adb->Shell(cmdline_cmd.GetData(), seconds(5), &cmdline_output);
+
+ if (cmdline_error.Fail() || cmdline_output.empty())
+ return;
+
+ cmdline_output = llvm::StringRef(cmdline_output).trim().str();
+ if (cmdline_output.empty())
+ return;
+
+ llvm::SmallVector<llvm::StringRef, 16> args;
+ llvm::StringRef(cmdline_output).split(args, ' ', -1, false);
+ if (args.empty())
+ return;
+
+ process_info.SetArg0(args[0]);
+ Args process_args;
+ for (size_t i = 1; i < args.size(); i++) {
+ if (!args[i].empty())
+ process_args.AppendArgument(args[i]);
+ }
+ process_info.SetArguments(process_args, false);
+}
+
+// Helper function to populate architecture from /proc/[pid]/exe
+void PlatformAndroid::PopulateProcessArchitecture(
+ lldb::pid_t pid, ProcessInstanceInfo &process_info) {
+ // Read /proc/[pid]/exe to get executable path for architecture detection
+ Status error;
+ AdbClientUP exe_adb = GetAdbClient(error);
+ if (error.Fail())
+ return;
+
+ std::string exe_output;
+ StreamString exe_cmd;
+ exe_cmd.Printf("readlink /proc/%llu/exe 2>/dev/null",
+ static_cast<unsigned long long>(pid));
+ Status exe_error = exe_adb->Shell(exe_cmd.GetData(), seconds(5), &exe_output);
+
+ if (exe_error.Fail() || exe_output.empty())
+ return;
+
+ exe_output = llvm::StringRef(exe_output).trim().str();
+
+ // Determine architecture from exe path
+ ArchSpec arch;
+ if (exe_output.find("64") != std::string::npos ||
+ exe_output.find("arm64") != std::string::npos ||
+ exe_output.find("aarch64") != std::string::npos) {
+ arch.SetTriple("aarch64-unknown-linux-android");
+ } else if (exe_output.find("x86_64") != std::string::npos) {
+ arch.SetTriple("x86_64-unknown-linux-android");
+ } else if (exe_output.find("x86") != std::string::npos ||
+ exe_output.find("i686") != std::string::npos) {
+ arch.SetTriple("i686-unknown-linux-android");
+ } else {
+ // Default to armv7 for 32-bit ARM (most common on Android)
+ arch.SetTriple("armv7-unknown-linux-android");
+ }
+
+ if (arch.IsValid())
+ process_info.SetArchitecture(arch);
+}
+
+uint32_t
+PlatformAndroid::FindProcesses(const ProcessInstanceInfoMatch &match_info,
+ ProcessInstanceInfoList &proc_infos) {
+ proc_infos.clear();
+
+ // When LLDB is running natively on an Android device (IsHost() == true),
+ // use the parent class's standard Linux /proc enumeration. IsHost() is only
+ // true when compiled for Android (#if defined(__ANDROID__)), so calling
+ // PlatformLinux methods is safe (Android is Linux-based).
+ if (IsHost())
+ return PlatformLinux::FindProcesses(match_info, proc_infos);
+
+ // Remote Android platform: implement process name lookup using 'pidof' over
+ // adb.
+
+ // LLDB stores the search name in GetExecutableFile() (even though it's
+ // actually a process name like "com.android.chrome" rather than an
+ // executable path). If no search name is provided, we can't use
+ // 'pidof', so return early with no results.
+ const ProcessInstanceInfo &match_process_info = match_info.GetProcessInfo();
+ if (!match_process_info.GetExecutableFile() ||
+ match_info.GetNameMatchType() == NameMatch::Ignore) {
+ return 0;
+ }
+
+ // Extract the process name to search for (typically an Android package name
+ // like "com.example.app" or binary name like "app_process64")
+ std::string process_name = match_process_info.GetExecutableFile().GetPath();
+ if (process_name.empty())
+ return 0;
+
+ // Use adb to find the process by name
+ Status error;
+ AdbClientUP adb(GetAdbClient(error));
+ if (error.Fail()) {
+ Log *log = GetLog(LLDBLog::Platform);
+ LLDB_LOGF(log, "PlatformAndroid::%s failed to get ADB client: %s",
+ __FUNCTION__, error.AsCString());
+ return 0;
+ }
+
+ // Use 'pidof' command to get PIDs for the process name.
+ // Quote the process name to handle special characters (spaces, etc.)
+ std::string pidof_output;
+ StreamString command;
+ command.Printf("pidof '%s'", process_name.c_str());
+ error = adb->Shell(command.GetData(), seconds(5), &pidof_output);
+
+ if (error.Fail()) {
+ Log *log = GetLog(LLDBLog::Platform);
+ LLDB_LOG(log, "PlatformAndroid::{} 'pidof {}' failed: {}", __FUNCTION__,
+ process_name.c_str(), error.AsCString());
+ return 0;
+ }
+
+ // Parse PIDs from pidof output.
+ // Note: pidof can return multiple PIDs (space-separated) if multiple
+ // instances of the same executable are running.
+ pidof_output = llvm::StringRef(pidof_output).trim().str();
+ if (pidof_output.empty()) {
+ Log *log = GetLog(LLDBLog::Platform);
+ LLDB_LOGF(log, "PlatformAndroid::%s no process found with name '%s'",
+ __FUNCTION__, process_name.c_str());
+ return 0;
+ }
+
+ // Split the output by whitespace to handle multiple PIDs
+ llvm::SmallVector<llvm::StringRef, 8> pid_strings;
+ llvm::StringRef(pidof_output).split(pid_strings, ' ', -1, false);
+
+ Log *log = GetLog(LLDBLog::Platform);
+
+ // Process each PID and gather information
+ uint32_t num_matches = 0;
+ for (llvm::StringRef pid_str : pid_strings) {
+ pid_str = pid_str.trim();
+ if (pid_str.empty())
+ continue;
+
+ lldb::pid_t pid;
+ if (!llvm::to_integer(pid_str, pid)) {
+ LLDB_LOGF(log, "PlatformAndroid::%s failed to parse PID from: '%s'",
+ __FUNCTION__, pid_str.str().c_str());
+ continue;
+ }
+
+ ProcessInstanceInfo process_info;
+ process_info.SetProcessID(pid);
+ process_info.GetExecutableFile().SetFile(process_name,
+ FileSpec::Style::posix);
+
+ // Populate additional process information
+ PopulateProcessStatusInfo(pid, process_info);
+ PopulateProcessCommandLine(pid, process_info);
+ PopulateProcessArchitecture(pid, process_info);
+
+ // Check if this process matches the criteria
+ if (match_info.Matches(process_info)) {
+ proc_infos.push_back(process_info);
+ num_matches++;
+
+ LLDB_LOGF(log, "PlatformAndroid::%s found process '%s' with PID %llu",
+ __FUNCTION__, process_name.c_str(),
+ static_cast<unsigned long long>(pid));
+ }
+ }
+
+ return num_matches;
+}
+
std::unique_ptr<AdbSyncService> PlatformAndroid::GetSyncService(Status &error) {
auto sync_service = std::make_unique<AdbSyncService>(m_device_id);
error = sync_service->SetupSyncConnection();
diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h
index 3384525..e771c6a 100644
--- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h
+++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h
@@ -59,6 +59,9 @@ public:
uint32_t GetDefaultMemoryCacheLineSize() override;
+ uint32_t FindProcesses(const ProcessInstanceInfoMatch &match_info,
+ ProcessInstanceInfoList &proc_infos) override;
+
protected:
const char *GetCacheHostname() override;
@@ -86,6 +89,14 @@ protected:
private:
std::string m_device_id;
uint32_t m_sdk_version;
+
+ // Helper functions for process information gathering
+ void PopulateProcessStatusInfo(lldb::pid_t pid,
+ ProcessInstanceInfo &process_info);
+ void PopulateProcessCommandLine(lldb::pid_t pid,
+ ProcessInstanceInfo &process_info);
+ void PopulateProcessArchitecture(lldb::pid_t pid,
+ ProcessInstanceInfo &process_info);
};
} // namespace platform_android
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp
index 660edaa..c9bb38d 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.cpp
@@ -81,6 +81,32 @@ std::optional<std::string> ScriptedBreakpointPythonInterface::GetShortHelp() {
return obj->GetAsString()->GetValue().str();
}
+lldb::BreakpointLocationSP ScriptedBreakpointPythonInterface::WasHit(
+ lldb::StackFrameSP frame_sp, lldb::BreakpointLocationSP bp_loc_sp) {
+ Status py_error;
+ lldb::BreakpointLocationSP loc_sp = Dispatch<lldb::BreakpointLocationSP>(
+ "was_hit", py_error, frame_sp, bp_loc_sp);
+
+ if (py_error.Fail())
+ return bp_loc_sp;
+
+ return loc_sp;
+}
+
+std::optional<std::string>
+ScriptedBreakpointPythonInterface::GetLocationDescription(
+ lldb::BreakpointLocationSP bp_loc_sp, lldb::DescriptionLevel level) {
+ Status error;
+ StructuredData::ObjectSP obj =
+ Dispatch("get_location_description", error, bp_loc_sp, level);
+
+ if (!ScriptedInterface::CheckStructuredDataObject(LLVM_PRETTY_FUNCTION, obj,
+ error))
+ return {};
+
+ return obj->GetAsString()->GetValue().str();
+}
+
void ScriptedBreakpointPythonInterface::Initialize() {
const std::vector<llvm::StringRef> ci_usages = {
"breakpoint set -P classname [-k key -v value ...]"};
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h
index 27bdd871..72da0a1 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedBreakpointPythonInterface.h
@@ -36,6 +36,12 @@ public:
bool ResolverCallback(SymbolContext sym_ctx) override;
lldb::SearchDepth GetDepth() override;
std::optional<std::string> GetShortHelp() override;
+ lldb::BreakpointLocationSP
+ WasHit(lldb::StackFrameSP frame_sp,
+ lldb::BreakpointLocationSP bp_loc_sp) override;
+ virtual std::optional<std::string>
+ GetLocationDescription(lldb::BreakpointLocationSP bp_loc_sp,
+ lldb::DescriptionLevel level) override;
static void Initialize();
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
index 8083cca..4fdf2b1 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.cpp
@@ -81,6 +81,19 @@ ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StreamSP>(
}
template <>
+lldb::StackFrameSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameSP>(
+ python::PythonObject &p, Status &error) {
+ if (lldb::SBFrame *sb_frame = reinterpret_cast<lldb::SBFrame *>(
+ python::LLDBSWIGPython_CastPyObjectToSBFrame(p.get())))
+ return m_interpreter.GetOpaqueTypeFromSBFrame(*sb_frame);
+ error = Status::FromErrorString(
+ "Couldn't cast lldb::SBFrame to lldb_private::StackFrame.");
+
+ return nullptr;
+}
+
+template <>
SymbolContext
ScriptedPythonInterface::ExtractValueFromPythonObject<SymbolContext>(
python::PythonObject &p, Status &error) {
@@ -127,6 +140,24 @@ ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::BreakpointSP>(
}
template <>
+lldb::BreakpointLocationSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<
+ lldb::BreakpointLocationSP>(python::PythonObject &p, Status &error) {
+ lldb::SBBreakpointLocation *sb_break_loc =
+ reinterpret_cast<lldb::SBBreakpointLocation *>(
+ python::LLDBSWIGPython_CastPyObjectToSBBreakpointLocation(p.get()));
+
+ if (!sb_break_loc) {
+ error = Status::FromErrorStringWithFormat(
+ "Couldn't cast lldb::SBBreakpointLocation to "
+ "lldb::BreakpointLocationSP.");
+ return nullptr;
+ }
+
+ return m_interpreter.GetOpaqueTypeFromSBBreakpointLocation(*sb_break_loc);
+}
+
+template <>
lldb::ProcessAttachInfoSP ScriptedPythonInterface::ExtractValueFromPythonObject<
lldb::ProcessAttachInfoSP>(python::PythonObject &p, Status &error) {
lldb::SBAttachInfo *sb_attach_info = reinterpret_cast<lldb::SBAttachInfo *>(
@@ -194,4 +225,22 @@ ScriptedPythonInterface::ExtractValueFromPythonObject<
return m_interpreter.GetOpaqueTypeFromSBExecutionContext(*sb_exe_ctx);
}
+template <>
+lldb::DescriptionLevel
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::DescriptionLevel>(
+ python::PythonObject &p, Status &error) {
+ lldb::DescriptionLevel ret_val = lldb::eDescriptionLevelBrief;
+ llvm::Expected<unsigned long long> unsigned_or_err = p.AsUnsignedLongLong();
+ if (!unsigned_or_err) {
+ error = (Status::FromError(unsigned_or_err.takeError()));
+ return ret_val;
+ }
+ unsigned long long unsigned_val = *unsigned_or_err;
+ if (unsigned_val >= lldb::DescriptionLevel::kNumDescriptionLevels) {
+ error = Status("value too large for lldb::DescriptionLevel.");
+ return ret_val;
+ }
+ return static_cast<lldb::DescriptionLevel>(unsigned_val);
+}
+
#endif
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
index f769d3d..2335b2e 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/Interfaces/ScriptedPythonInterface.h
@@ -436,6 +436,10 @@ protected:
return python::SWIGBridge::ToSWIGWrapper(arg);
}
+ python::PythonObject Transform(lldb::BreakpointLocationSP arg) {
+ return python::SWIGBridge::ToSWIGWrapper(arg);
+ }
+
python::PythonObject Transform(lldb::ProcessSP arg) {
return python::SWIGBridge::ToSWIGWrapper(arg);
}
@@ -464,10 +468,18 @@ protected:
return python::SWIGBridge::ToSWIGWrapper(arg.get());
}
+ python::PythonObject Transform(lldb::StackFrameSP arg) {
+ return python::SWIGBridge::ToSWIGWrapper(arg);
+ }
+
python::PythonObject Transform(lldb::DataExtractorSP arg) {
return python::SWIGBridge::ToSWIGWrapper(arg);
}
+ python::PythonObject Transform(lldb::DescriptionLevel arg) {
+ return python::SWIGBridge::ToSWIGWrapper(arg);
+ }
+
template <typename T, typename U>
void ReverseTransform(T &original_arg, U transformed_arg, Status &error) {
// If U is not a PythonObject, don't touch it!
@@ -574,11 +586,21 @@ ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StreamSP>(
python::PythonObject &p, Status &error);
template <>
+lldb::StackFrameSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::StackFrameSP>(
+ python::PythonObject &p, Status &error);
+
+template <>
lldb::BreakpointSP
ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::BreakpointSP>(
python::PythonObject &p, Status &error);
template <>
+lldb::BreakpointLocationSP
+ScriptedPythonInterface::ExtractValueFromPythonObject<
+ lldb::BreakpointLocationSP>(python::PythonObject &p, Status &error);
+
+template <>
lldb::ProcessAttachInfoSP ScriptedPythonInterface::ExtractValueFromPythonObject<
lldb::ProcessAttachInfoSP>(python::PythonObject &p, Status &error);
@@ -601,6 +623,11 @@ lldb::ExecutionContextRefSP
ScriptedPythonInterface::ExtractValueFromPythonObject<
lldb::ExecutionContextRefSP>(python::PythonObject &p, Status &error);
+template <>
+lldb::DescriptionLevel
+ScriptedPythonInterface::ExtractValueFromPythonObject<lldb::DescriptionLevel>(
+ python::PythonObject &p, Status &error);
+
} // namespace lldb_private
#endif // LLDB_ENABLE_PYTHON
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
index 4137786..7b39d29 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
@@ -107,6 +107,7 @@ public:
static PythonObject ToSWIGWrapper(lldb::ProcessAttachInfoSP attach_info_sp);
static PythonObject ToSWIGWrapper(lldb::ProcessLaunchInfoSP launch_info_sp);
static PythonObject ToSWIGWrapper(lldb::DataExtractorSP data_extractor_sp);
+ static PythonObject ToSWIGWrapper(lldb::DescriptionLevel level);
static PythonObject
ToSWIGWrapper(std::unique_ptr<lldb::SBStructuredData> data_sb);
@@ -256,11 +257,13 @@ public:
void *LLDBSWIGPython_CastPyObjectToSBData(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBBreakpoint(PyObject *data);
+void *LLDBSWIGPython_CastPyObjectToSBBreakpointLocation(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBAttachInfo(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBLaunchInfo(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBError(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBEvent(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data);
+void *LLDBSWIGPython_CastPyObjectToSBFrame(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBSymbolContext(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBValue(PyObject *data);
void *LLDBSWIGPython_CastPyObjectToSBMemoryRegionInfo(PyObject *data);
diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp
index 6114ecc..2be1e389 100644
--- a/lldb/source/Symbol/Function.cpp
+++ b/lldb/source/Symbol/Function.cpp
@@ -275,7 +275,7 @@ Function::~Function() = default;
void Function::GetStartLineSourceInfo(SupportFileSP &source_file_sp,
uint32_t &line_no) {
line_no = 0;
- source_file_sp.reset();
+ source_file_sp = std::make_shared<SupportFile>();
if (m_comp_unit == nullptr)
return;
diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp
index f47dae2..7fa1fc5 100644
--- a/lldb/source/Target/StopInfo.cpp
+++ b/lldb/source/Target/StopInfo.cpp
@@ -157,7 +157,8 @@ public:
ExecutionContext exe_ctx(thread_sp->GetStackFrameAtIndex(0));
StoppointCallbackContext context(event_ptr, exe_ctx, true);
bp_site_sp->BumpHitCounts();
- m_should_stop = bp_site_sp->ShouldStop(&context);
+ m_should_stop =
+ bp_site_sp->ShouldStop(&context, m_async_stopped_locs);
} else {
Log *log = GetLog(LLDBLog::Process);
@@ -180,6 +181,7 @@ public:
}
const char *GetDescription() override {
+ // FIXME: only print m_async_stopped_locs.
if (m_description.empty()) {
ThreadSP thread_sp(m_thread_wp.lock());
if (thread_sp) {
@@ -202,7 +204,7 @@ public:
}
strm.Printf("breakpoint ");
- bp_site_sp->GetDescription(&strm, eDescriptionLevelBrief);
+ m_async_stopped_locs.GetDescription(&strm, eDescriptionLevelBrief);
m_description = std::string(strm.GetString());
} else {
StreamString strm;
@@ -244,6 +246,12 @@ public:
}
uint32_t GetStopReasonDataCount() const override {
+ size_t num_async_locs = m_async_stopped_locs.GetSize();
+ // If we have async locations, they are the ones we should report:
+ if (num_async_locs > 0)
+ return num_async_locs * 2;
+
+ // Otherwise report the number of locations at this breakpoint's site.
lldb::BreakpointSiteSP bp_site_sp = GetBreakpointSiteSP();
if (bp_site_sp)
return bp_site_sp->GetNumberOfConstituents() * 2;
@@ -251,22 +259,25 @@ public:
}
uint64_t GetStopReasonDataAtIndex(uint32_t idx) override {
- lldb::BreakpointSiteSP bp_site_sp = GetBreakpointSiteSP();
- if (bp_site_sp) {
- uint32_t bp_index = idx / 2;
- BreakpointLocationSP bp_loc_sp(
- bp_site_sp->GetConstituentAtIndex(bp_index));
- if (bp_loc_sp) {
- if (idx & 1) {
- // FIXME: This might be a Facade breakpoint, so we need to fetch
- // the one that the thread actually hit, not the native loc ID.
-
- // Odd idx, return the breakpoint location ID
- return bp_loc_sp->GetID();
- } else {
- // Even idx, return the breakpoint ID
- return bp_loc_sp->GetBreakpoint().GetID();
- }
+ uint32_t bp_index = idx / 2;
+ BreakpointLocationSP loc_to_report_sp;
+
+ size_t num_async_locs = m_async_stopped_locs.GetSize();
+ if (num_async_locs > 0) {
+ // GetByIndex returns an empty SP if we ask past its contents:
+ loc_to_report_sp = m_async_stopped_locs.GetByIndex(bp_index);
+ } else {
+ lldb::BreakpointSiteSP bp_site_sp = GetBreakpointSiteSP();
+ if (bp_site_sp)
+ loc_to_report_sp = bp_site_sp->GetConstituentAtIndex(bp_index);
+ }
+ if (loc_to_report_sp) {
+ if (idx & 1) {
+ // Odd idx, return the breakpoint location ID
+ return loc_to_report_sp->GetID();
+ } else {
+ // Even idx, return the breakpoint ID
+ return loc_to_report_sp->GetBreakpoint().GetID();
}
}
return LLDB_INVALID_BREAK_ID;
@@ -335,8 +346,7 @@ protected:
// local list. That way if one of the breakpoint actions changes the
// site, then we won't be operating on a bad list.
BreakpointLocationCollection site_locations;
- size_t num_constituents =
- bp_site_sp->CopyConstituentsList(site_locations);
+ size_t num_constituents = m_async_stopped_locs.GetSize();
if (num_constituents == 0) {
m_should_stop = true;
@@ -436,16 +446,26 @@ protected:
// I'm just sticking the BreakpointSP's in a vector since I'm only
// using it to locally increment their retain counts.
+ // We are holding onto the breakpoint locations that were hit
+ // by this stop info between the "synchronous" ShouldStop and now.
+ // But an intervening action might have deleted one of the breakpoints
+ // we hit before we get here. So at the same time let's build a list
+ // of the still valid locations:
std::vector<lldb::BreakpointSP> location_constituents;
+ BreakpointLocationCollection valid_locs;
for (size_t j = 0; j < num_constituents; j++) {
- BreakpointLocationSP loc(site_locations.GetByIndex(j));
- location_constituents.push_back(
- loc->GetBreakpoint().shared_from_this());
+ BreakpointLocationSP loc_sp(m_async_stopped_locs.GetByIndex(j));
+ if (loc_sp->IsValid()) {
+ location_constituents.push_back(
+ loc_sp->GetBreakpoint().shared_from_this());
+ valid_locs.Add(loc_sp);
+ }
}
- for (size_t j = 0; j < num_constituents; j++) {
- lldb::BreakpointLocationSP bp_loc_sp = site_locations.GetByIndex(j);
+ size_t num_valid_locs = valid_locs.GetSize();
+ for (size_t j = 0; j < num_valid_locs; j++) {
+ lldb::BreakpointLocationSP bp_loc_sp = valid_locs.GetByIndex(j);
StreamString loc_desc;
if (log) {
bp_loc_sp->GetDescription(&loc_desc, eDescriptionLevelBrief);
@@ -679,6 +699,7 @@ private:
lldb::break_id_t m_break_id;
bool m_was_all_internal;
bool m_was_one_shot;
+ BreakpointLocationCollection m_async_stopped_locs;
};
// StopInfoWatchpoint
diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/resolver.py b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/resolver.py
index 85c73401..9586848 100644
--- a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/resolver.py
+++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/resolver.py
@@ -51,7 +51,6 @@ class Resolver:
def get_short_help(self):
return "I am a python breakpoint resolver"
-
class ResolverModuleDepth(Resolver):
def __get_depth__(self):
return lldb.eSearchDepthModule
diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/Makefile b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/Makefile
new file mode 100644
index 0000000..695335e
--- /dev/null
+++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/Makefile
@@ -0,0 +1,4 @@
+C_SOURCES := main.c
+CFLAGS_EXTRAS := -std=c99
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/TestWasHit.py b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/TestWasHit.py
new file mode 100644
index 0000000..2e17623
--- /dev/null
+++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/TestWasHit.py
@@ -0,0 +1,102 @@
+"""
+Test the WasHit feature of scripted breakpoints
+"""
+
+import os
+import lldb
+import lldbsuite.test.lldbutil as lldbutil
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+
+
+class TestWasHit(TestBase):
+ NO_DEBUG_INFO_TESTCASE = True
+
+ @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528")
+ def test_was_hit_resolver(self):
+ """Use facade breakpoints to emulate hitting some locations"""
+ self.build()
+ self.do_test()
+
+ def make_target_and_import(self):
+ target = lldbutil.run_to_breakpoint_make_target(self)
+ self.import_resolver_script()
+ return target
+
+ def import_resolver_script(self):
+ interp = self.dbg.GetCommandInterpreter()
+ error = lldb.SBError()
+
+ script_name = os.path.join(self.getSourceDir(), "bkpt_resolver.py")
+
+ command = "command script import " + script_name
+ self.runCmd(command)
+
+ def make_extra_args(self, sym_name, num_locs, loc_to_miss):
+ return f" -k symbol -v {sym_name} -k num_locs -v {num_locs} -k loc_to_miss -v {loc_to_miss} "
+
+ def do_test(self):
+ """This reads in a python file and sets a breakpoint using it."""
+
+ target = self.make_target_and_import()
+ extra_args = self.make_extra_args("stop_symbol", 4, 2)
+
+ bkpt_no = lldbutil.run_break_set_by_script(
+ self, "bkpt_resolver.FacadeExample", extra_args, 4
+ )
+
+ # Make sure the help text shows up in the "break list" output:
+ self.expect(
+ "break list",
+ substrs=["I am a facade resolver - sym: stop_symbol - num_locs: 4"],
+ msg="Help is listed in break list",
+ )
+
+ bkpt = target.FindBreakpointByID(bkpt_no)
+ self.assertTrue(bkpt.IsValid(), "Found the right breakpoint")
+
+ # Now continue. We should hit locations 1, 3 and 4:
+ (target, process, thread, bkpt) = lldbutil.run_to_breakpoint_do_run(
+ self, target, bkpt
+ )
+ # This location should be bkpt_no.1:
+ self.assertEqual(
+ thread.stop_reason_data[0], bkpt_no, "Hit the right breakpoint"
+ )
+ self.assertEqual(thread.stop_reason_data[1], 1, "First location hit is 1")
+
+ for loc in [3, 4]:
+ process.Continue()
+ self.assertEqual(
+ thread.stop_reason, lldb.eStopReasonBreakpoint, "Hit breakpoint"
+ )
+ self.assertEqual(
+ thread.stop_reason_data[0], bkpt_no, "Hit the right breakpoint"
+ )
+ self.assertEqual(
+ thread.stop_reason_data[1], loc, f"Hit the right location: {loc}"
+ )
+
+ # At this point we should have hit three of the four locations, and not location 1.2.
+ # Check that that is true, and that the descriptions for the location are the ones
+ # the resolver provided.
+ self.assertEqual(bkpt.hit_count, 3, "Hit three locations")
+ for loc_id in range(1, 4):
+ bkpt_loc = bkpt.FindLocationByID(loc_id)
+ self.assertTrue(bkpt_loc.IsValid(), f"{loc_id} was invalid.")
+ if loc_id != 2:
+ self.assertEqual(
+ bkpt_loc.hit_count, 1, f"Loc {loc_id} hit count was wrong"
+ )
+ else:
+ self.assertEqual(bkpt_loc.hit_count, 0, "We didn't skip loc 2")
+ stream = lldb.SBStream()
+ self.assertTrue(
+ bkpt_loc.GetDescription(stream, lldb.eDescriptionLevelFull),
+ f"Didn't get description for {loc_id}",
+ )
+ self.assertIn(
+ f"Location index: {loc_id}",
+ stream.GetData(),
+ f"Wrong description for {loc_id}",
+ )
diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/bkpt_resolver.py b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/bkpt_resolver.py
new file mode 100644
index 0000000..acc8513
--- /dev/null
+++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/bkpt_resolver.py
@@ -0,0 +1,49 @@
+import lldb
+
+
+class FacadeExample:
+ def __init__(self, bkpt, extra_args, dict):
+ self.bkpt = bkpt
+ self.extra_args = extra_args
+ self.base_sym = None
+ self.facade_locs = []
+ self.facade_locs_desc = []
+ self.cur_facade_loc = 1
+
+ self.sym_name = extra_args.GetValueForKey("symbol").GetStringValue(100)
+ self.num_locs = extra_args.GetValueForKey("num_locs").GetIntegerValue(5)
+ self.loc_to_miss = extra_args.GetValueForKey("loc_to_miss").GetIntegerValue(
+ 10000
+ )
+
+ def __callback__(self, sym_ctx):
+ self.base_sym = sym_ctx.module.FindSymbol(self.sym_name, lldb.eSymbolTypeCode)
+ if self.base_sym.IsValid():
+ self.bkpt.AddLocation(self.base_sym.GetStartAddress())
+ # Locations are 1 based, so to keep things simple, I'm making
+ # the array holding locations 1 based as well:
+ self.facade_locs_desc.append(
+ "This is the zero index, you shouldn't see this"
+ )
+ self.facade_locs.append(None)
+ for i in range(1, self.num_locs + 1):
+ self.facade_locs_desc.append(f"Location index: {i}")
+ self.facade_locs.append(self.bkpt.AddFacadeLocation())
+
+ def get_short_help(self):
+ return f"I am a facade resolver - sym: {self.sym_name} - num_locs: {self.num_locs} - locs_to_miss: {self.loc_to_miss}"
+
+ def was_hit(self, frame, bp_loc):
+ tmp_loc = self.cur_facade_loc
+
+ self.cur_facade_loc = self.cur_facade_loc + 1
+ if self.cur_facade_loc == self.num_locs + 1:
+ self.cur_facade_loc = 1
+
+ if tmp_loc == self.loc_to_miss:
+ return None
+
+ return self.facade_locs[tmp_loc]
+
+ def get_location_description(self, bp_loc, desc_level):
+ return self.facade_locs_desc[bp_loc.id]
diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/main.c b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/main.c
new file mode 100644
index 0000000..b8f977e
--- /dev/null
+++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/was_hit/main.c
@@ -0,0 +1,14 @@
+#include <stdio.h>
+
+int stop_symbol() {
+ static int s_cnt = 0;
+ printf("I am in the stop symbol: %d\n", s_cnt++);
+ return s_cnt;
+}
+
+int main() {
+ for (int i = 0; i < 100; i++) {
+ stop_symbol();
+ }
+ return 0;
+}
diff --git a/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py b/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py
index 768dd6f..ebba4d1 100644
--- a/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py
+++ b/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py
@@ -12,20 +12,6 @@ from lldbsuite.test import lldbutil
class TestCortexMExceptionUnwind(TestBase):
NO_DEBUG_INFO_TESTCASE = True
- # on the lldb-remote-linux-ubuntu CI, the binary.json's triple of
- # armv7m-apple is not being set in the Target triple, and we're
- # picking the wrong ABI plugin, ABISysV_arm.
- # ABISysV_arm::CreateDefaultUnwindPlan() doesn't have a way to detect
- # arm/thumb for a stack frame, or even the Target's triple for a
- # Cortex-M part that is always thumb. It hardcodes r11 as the frame
- # pointer register, which is correct for arm code but not thumb.
- # It is never correct # on a Cortex-M target.
- # The Darwin ABIMacOSX_arm diverges from AAPCS and always uses r7 for
- # the frame pointer -- the thumb convention -- whether executing arm or
- # thumb. So its CreateDefaultUnwindPlan picks the correct register for
- # the frame pointer, and we can walk the stack.
- # ABISysV_arm::CreateDefaultUnwindPlan will only get one frame and
- # not be able to continue.
@skipIfRemote
def test_no_fpu(self):
"""Test that we can backtrace correctly through an ARM Cortex-M Exception return stack"""
@@ -59,10 +45,9 @@ class TestCortexMExceptionUnwind(TestBase):
# frames above that. The topmost two stack frames
# were not interesting for this test, so I didn't
# create symbols for them.
- self.assertEqual(thread.GetNumFrames(), 6)
+ self.assertEqual(thread.GetNumFrames(), 3)
stackframe_names = [
"exception_catcher",
- "exception_catcher",
"exception_thrower",
"main",
]
diff --git a/lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml b/lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml
index 9ce5ff4..0b4e1f8 100644
--- a/lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml
+++ b/lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml
@@ -2,8 +2,8 @@ cpu: armv7m
threads:
- regsets:
- flavor: gpr
- registers: [{name: sp, value: 0x2000fe70}, {name: r7, value: 0x2000fe80},
- {name: pc, value: 0x0020392c}, {name: lr, value: 0x0020392d}]
+ registers: [{name: sp, value: 0x2000fe88}, {name: r7, value: 0x2000fe88},
+ {name: pc, value: 0x00203916}, {name: lr, value: 0x0020392d}]
memory-regions:
# stack memory fetched via
# (lldb) p/x $sp
@@ -14,7 +14,7 @@ memory-regions:
0x0000002a, 0x20010e58, 0x00203923, 0x00000001,
0x2000fe88, 0x00203911, 0x2000ffdc, 0xfffffff9,
0x00000102, 0x00000002, 0x000003f0, 0x0000002a,
- 0x20012620, 0x00203215, 0x00203366, 0x81000200,
+ 0x20012620, 0x00203215, 0x00202a92, 0x81000200,
0x00203215, 0x200128b0, 0x0024928d, 0x2000fecc,
0x002491ed, 0x20010e58, 0x20010e4c, 0x2000ffa0,
0x200107a0, 0x0000003c, 0x200116e8, 0x200108b0,
@@ -62,3 +62,26 @@ memory-regions:
0x98, 0xae, 0x28, 0x00
]
+ # exception_thrower
+ # (lldb) disass -b -c 12 -n exception_thrower
+ # 0x202a88 <+0>: 0xb5f0 push {r4, r5, r6, r7, lr}
+ # 0x202a8a <+2>: 0xaf03 add r7, sp, #0xc
+ # 0x202a8c <+4>: 0xe92d0f00 push.w {r8, r9, r10, r11}
+ # 0x202a90 <+8>: 0xb0c3 sub sp, #0x10c
+ # 0x202a92 <+10>: 0xf7ffffd9 bl 0x202a48
+ - addr: 0x202a88
+ UInt8: [
+ 0xf0, 0xb5, 0x03, 0xaf, 0x2d, 0xe9, 0x00, 0x0f,
+ 0xc3, 0xb0, 0xff, 0xf7, 0xd9, 0xff, 0xff, 0xf7
+ ]
+
+ # main:
+ # 0x202a7e <+0>: push {r7, lr}
+ # 0x202a80 <+2>: mov r7, sp
+ # 0x202a82 <+4>: bl 0x202a88 ; exception_thrower
+ # 0x202a86 <+8>: nop
+ - addr: 0x202a7e
+ UInt8: [
+ 0x80, 0xb5, 0x6f, 0x46, 0x00, 0xf0, 0x01, 0xf8,
+ 0x00, 0xbf
+ ]
diff --git a/lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json b/lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json
index 8fcd530..0de0169 100644
--- a/lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json
+++ b/lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json
@@ -1,5 +1,5 @@
{
- "triple": "armv7m-apple",
+ "triple": "armv7m--",
"uuid": "2D157DBA-53C9-3AC7-B5A1-9D336EC831CB",
"type": "executable",
"sections": [
@@ -28,13 +28,13 @@
{
"name": "exception_catcher",
"type": "code",
- "size": 44,
+ "size": 32,
"address": 2111760
},
{
"name": "exception_thrower",
"type": "code",
- "size": 2652,
+ "size": 16,
"address": 2108040
}
]
diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt
index 446bf68..67956af 100644
--- a/lldb/tools/driver/CMakeLists.txt
+++ b/lldb/tools/driver/CMakeLists.txt
@@ -34,6 +34,10 @@ add_dependencies(lldb
${tablegen_deps}
)
+if(DEFINED LLDB_PYTHON_DLL_RELATIVE_PATH)
+ target_compile_definitions(lldb PRIVATE LLDB_PYTHON_DLL_RELATIVE_PATH="${LLDB_PYTHON_DLL_RELATIVE_PATH}")
+endif()
+
if(LLDB_BUILD_FRAMEWORK)
# In the build-tree, we know the exact path to the framework directory.
# The installed framework can be in different locations.
diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp
index 16cc736..ba00411 100644
--- a/lldb/tools/driver/Driver.cpp
+++ b/lldb/tools/driver/Driver.cpp
@@ -22,7 +22,10 @@
#include "lldb/Host/MainLoop.h"
#include "lldb/Host/MainLoopBase.h"
#include "lldb/Utility/Status.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Path.h"
@@ -30,6 +33,10 @@
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
+#ifdef _WIN32
+#include "llvm/Support/Windows/WindowsSupport.h"
+#endif
+
#include <algorithm>
#include <atomic>
#include <bitset>
@@ -426,6 +433,47 @@ SBError Driver::ProcessArgs(const opt::InputArgList &args, bool &exiting) {
return error;
}
+#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+/// Returns the full path to the lldb.exe executable.
+inline std::wstring GetPathToExecutableW() {
+ // Iterate until we reach the Windows API maximum path length (32,767).
+ std::vector<WCHAR> buffer;
+ buffer.resize(MAX_PATH /*=260*/);
+ while (buffer.size() < 32767) {
+ if (GetModuleFileNameW(NULL, buffer.data(), buffer.size()) < buffer.size())
+ return std::wstring(buffer.begin(), buffer.end());
+ buffer.resize(buffer.size() * 2);
+ }
+ return L"";
+}
+
+/// Resolve the full path of the directory defined by
+/// LLDB_PYTHON_DLL_RELATIVE_PATH. If it exists, add it to the list of DLL
+/// search directories.
+void AddPythonDLLToSearchPath() {
+ std::wstring modulePath = GetPathToExecutableW();
+ if (modulePath.empty()) {
+ llvm::errs() << "error: unable to get the path to the lldb executable." << '\n';
+ return;
+ }
+
+ SmallVector<char, MAX_PATH> utf8Path;
+ if (sys::windows::UTF16ToUTF8(modulePath.c_str(), modulePath.length(),
+ utf8Path))
+ return;
+ sys::path::remove_filename(utf8Path);
+ sys::path::append(utf8Path, LLDB_PYTHON_DLL_RELATIVE_PATH);
+ sys::fs::make_absolute(utf8Path);
+
+ SmallVector<wchar_t, 1> widePath;
+ if (sys::windows::widenPath(utf8Path.data(), widePath))
+ return;
+
+ if (sys::fs::exists(utf8Path))
+ SetDllDirectoryW(widePath.data());
+}
+#endif
+
std::string EscapeString(std::string arg) {
std::string::size_type pos = 0;
while ((pos = arg.find_first_of("\"\\", pos)) != std::string::npos) {
@@ -728,6 +776,10 @@ int main(int argc, char const *argv[]) {
"~/Library/Logs/DiagnosticReports/.\n");
#endif
+#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+ AddPythonDLLToSearchPath();
+#endif
+
// Parse arguments.
LLDBOptTable T;
unsigned MissingArgIndex;
diff --git a/lldb/unittests/Host/JSONTransportTest.cpp b/lldb/unittests/Host/JSONTransportTest.cpp
index 7db6508..54f1372 100644
--- a/lldb/unittests/Host/JSONTransportTest.cpp
+++ b/lldb/unittests/Host/JSONTransportTest.cpp
@@ -49,85 +49,85 @@ namespace {
namespace test_protocol {
-struct Req {
+struct Request {
int id = 0;
std::string name;
std::optional<json::Value> params;
};
-json::Value toJSON(const Req &T) {
+json::Value toJSON(const Request &T) {
return json::Object{{"name", T.name}, {"id", T.id}, {"params", T.params}};
}
-bool fromJSON(const json::Value &V, Req &T, json::Path P) {
+bool fromJSON(const json::Value &V, Request &T, json::Path P) {
json::ObjectMapper O(V, P);
return O && O.map("name", T.name) && O.map("id", T.id) &&
O.map("params", T.params);
}
-bool operator==(const Req &a, const Req &b) {
+bool operator==(const Request &a, const Request &b) {
return a.name == b.name && a.id == b.id && a.params == b.params;
}
-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Req &V) {
+inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Request &V) {
OS << toJSON(V);
return OS;
}
-void PrintTo(const Req &message, std::ostream *os) {
+void PrintTo(const Request &message, std::ostream *os) {
std::string O;
llvm::raw_string_ostream OS(O);
OS << message;
*os << O;
}
-struct Resp {
+struct Response {
int id = 0;
int errorCode = 0;
std::optional<json::Value> result;
};
-json::Value toJSON(const Resp &T) {
+json::Value toJSON(const Response &T) {
return json::Object{
{"id", T.id}, {"errorCode", T.errorCode}, {"result", T.result}};
}
-bool fromJSON(const json::Value &V, Resp &T, json::Path P) {
+bool fromJSON(const json::Value &V, Response &T, json::Path P) {
json::ObjectMapper O(V, P);
return O && O.map("id", T.id) && O.mapOptional("errorCode", T.errorCode) &&
O.map("result", T.result);
}
-bool operator==(const Resp &a, const Resp &b) {
+bool operator==(const Response &a, const Response &b) {
return a.id == b.id && a.errorCode == b.errorCode && a.result == b.result;
}
-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Resp &V) {
+inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Response &V) {
OS << toJSON(V);
return OS;
}
-void PrintTo(const Resp &message, std::ostream *os) {
+void PrintTo(const Response &message, std::ostream *os) {
std::string O;
llvm::raw_string_ostream OS(O);
OS << message;
*os << O;
}
-struct Evt {
+struct Event {
std::string name;
std::optional<json::Value> params;
};
-json::Value toJSON(const Evt &T) {
+json::Value toJSON(const Event &T) {
return json::Object{{"name", T.name}, {"params", T.params}};
}
-bool fromJSON(const json::Value &V, Evt &T, json::Path P) {
+bool fromJSON(const json::Value &V, Event &T, json::Path P) {
json::ObjectMapper O(V, P);
return O && O.map("name", T.name) && O.map("params", T.params);
}
-bool operator==(const Evt &a, const Evt &b) { return a.name == b.name; }
-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Evt &V) {
+bool operator==(const Event &a, const Event &b) { return a.name == b.name; }
+inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Event &V) {
OS << toJSON(V);
return OS;
}
-void PrintTo(const Evt &message, std::ostream *os) {
+void PrintTo(const Event &message, std::ostream *os) {
std::string O;
llvm::raw_string_ostream OS(O);
OS << message;
*os << O;
}
-using Message = std::variant<Req, Resp, Evt>;
+using Message = std::variant<Request, Response, Event>;
json::Value toJSON(const Message &msg) {
return std::visit([](const auto &msg) { return toJSON(msg); }, msg);
}
@@ -139,7 +139,7 @@ bool fromJSON(const json::Value &V, Message &msg, json::Path P) {
}
if (O->find("id") == O->end()) {
- Evt E;
+ Event E;
if (!fromJSON(V, E, P))
return false;
@@ -148,7 +148,7 @@ bool fromJSON(const json::Value &V, Message &msg, json::Path P) {
}
if (O->get("name")) {
- Req R;
+ Request R;
if (!fromJSON(V, R, P))
return false;
@@ -156,7 +156,7 @@ bool fromJSON(const json::Value &V, Message &msg, json::Path P) {
return true;
}
- Resp R;
+ Response R;
if (!fromJSON(V, R, P))
return false;
@@ -187,9 +187,9 @@ bool fromJSON(const json::Value &V, MyFnResult &T, json::Path P) {
struct ProtoDesc {
using Id = int;
- using Req = Req;
- using Resp = Resp;
- using Evt = Evt;
+ using Req = Request;
+ using Resp = Response;
+ using Evt = Event;
static inline Id InitialId() { return 0; }
static inline Req Make(Id id, llvm::StringRef method,
@@ -393,8 +393,8 @@ TEST_F(HTTPDelimitedJSONTransportTest, MalformedRequests) {
}
TEST_F(HTTPDelimitedJSONTransportTest, Read) {
- Write(Req{6, "foo", std::nullopt});
- EXPECT_CALL(message_handler, Received(Req{6, "foo", std::nullopt}));
+ Write(Request{6, "foo", std::nullopt});
+ EXPECT_CALL(message_handler, Received(Request{6, "foo", std::nullopt}));
ASSERT_THAT_ERROR(Run(), Succeeded());
}
@@ -402,17 +402,18 @@ TEST_F(HTTPDelimitedJSONTransportTest, ReadMultipleMessagesInSingleWrite) {
InSequence seq;
Write(
Message{
- Req{6, "one", std::nullopt},
+ Request{6, "one", std::nullopt},
},
Message{
- Evt{"two", std::nullopt},
+ test_protocol::Event{"two", std::nullopt},
},
Message{
- Resp{2, 0, std::nullopt},
+ Response{2, 0, std::nullopt},
});
- EXPECT_CALL(message_handler, Received(Req{6, "one", std::nullopt}));
- EXPECT_CALL(message_handler, Received(Evt{"two", std::nullopt}));
- EXPECT_CALL(message_handler, Received(Resp{2, 0, std::nullopt}));
+ EXPECT_CALL(message_handler, Received(Request{6, "one", std::nullopt}));
+ EXPECT_CALL(message_handler,
+ Received(test_protocol::Event{"two", std::nullopt}));
+ EXPECT_CALL(message_handler, Received(Response{2, 0, std::nullopt}));
ASSERT_THAT_ERROR(Run(), Succeeded());
}
@@ -420,18 +421,18 @@ TEST_F(HTTPDelimitedJSONTransportTest, ReadAcrossMultipleChunks) {
std::string long_str = std::string(
HTTPDelimitedJSONTransport<test_protocol::ProtoDesc>::kReadBufferSize * 2,
'x');
- Write(Req{5, long_str, std::nullopt});
- EXPECT_CALL(message_handler, Received(Req{5, long_str, std::nullopt}));
+ Write(Request{5, long_str, std::nullopt});
+ EXPECT_CALL(message_handler, Received(Request{5, long_str, std::nullopt}));
ASSERT_THAT_ERROR(Run(), Succeeded());
}
TEST_F(HTTPDelimitedJSONTransportTest, ReadPartialMessage) {
- std::string message = Encode(Req{5, "foo", std::nullopt});
+ std::string message = Encode(Request{5, "foo", std::nullopt});
auto split_at = message.size() / 2;
std::string part1 = message.substr(0, split_at);
std::string part2 = message.substr(split_at);
- EXPECT_CALL(message_handler, Received(Req{5, "foo", std::nullopt}));
+ EXPECT_CALL(message_handler, Received(Request{5, "foo", std::nullopt}));
ASSERT_THAT_EXPECTED(input.Write(part1.data(), part1.size()), Succeeded());
loop.AddPendingCallback(
@@ -443,12 +444,12 @@ TEST_F(HTTPDelimitedJSONTransportTest, ReadPartialMessage) {
}
TEST_F(HTTPDelimitedJSONTransportTest, ReadWithZeroByteWrites) {
- std::string message = Encode(Req{6, "foo", std::nullopt});
+ std::string message = Encode(Request{6, "foo", std::nullopt});
auto split_at = message.size() / 2;
std::string part1 = message.substr(0, split_at);
std::string part2 = message.substr(split_at);
- EXPECT_CALL(message_handler, Received(Req{6, "foo", std::nullopt}));
+ EXPECT_CALL(message_handler, Received(Request{6, "foo", std::nullopt}));
ASSERT_THAT_EXPECTED(input.Write(part1.data(), part1.size()), Succeeded());
@@ -500,9 +501,11 @@ TEST_F(HTTPDelimitedJSONTransportTest, InvalidTransport) {
}
TEST_F(HTTPDelimitedJSONTransportTest, Write) {
- ASSERT_THAT_ERROR(transport->Send(Req{7, "foo", std::nullopt}), Succeeded());
- ASSERT_THAT_ERROR(transport->Send(Resp{5, 0, "bar"}), Succeeded());
- ASSERT_THAT_ERROR(transport->Send(Evt{"baz", std::nullopt}), Succeeded());
+ ASSERT_THAT_ERROR(transport->Send(Request{7, "foo", std::nullopt}),
+ Succeeded());
+ ASSERT_THAT_ERROR(transport->Send(Response{5, 0, "bar"}), Succeeded());
+ ASSERT_THAT_ERROR(transport->Send(test_protocol::Event{"baz", std::nullopt}),
+ Succeeded());
output.CloseWriteFileDescriptor();
char buf[1024];
Expected<size_t> bytes_read =
@@ -530,18 +533,20 @@ TEST_F(JSONRPCTransportTest, MalformedRequests) {
}
TEST_F(JSONRPCTransportTest, Read) {
- Write(Message{Req{1, "foo", std::nullopt}});
- EXPECT_CALL(message_handler, Received(Req{1, "foo", std::nullopt}));
+ Write(Message{Request{1, "foo", std::nullopt}});
+ EXPECT_CALL(message_handler, Received(Request{1, "foo", std::nullopt}));
ASSERT_THAT_ERROR(Run(), Succeeded());
}
TEST_F(JSONRPCTransportTest, ReadMultipleMessagesInSingleWrite) {
InSequence seq;
- Write(Message{Req{1, "one", std::nullopt}}, Message{Evt{"two", std::nullopt}},
- Message{Resp{3, 0, "three"}});
- EXPECT_CALL(message_handler, Received(Req{1, "one", std::nullopt}));
- EXPECT_CALL(message_handler, Received(Evt{"two", std::nullopt}));
- EXPECT_CALL(message_handler, Received(Resp{3, 0, "three"}));
+ Write(Message{Request{1, "one", std::nullopt}},
+ Message{test_protocol::Event{"two", std::nullopt}},
+ Message{Response{3, 0, "three"}});
+ EXPECT_CALL(message_handler, Received(Request{1, "one", std::nullopt}));
+ EXPECT_CALL(message_handler,
+ Received(test_protocol::Event{"two", std::nullopt}));
+ EXPECT_CALL(message_handler, Received(Response{3, 0, "three"}));
ASSERT_THAT_ERROR(Run(), Succeeded());
}
@@ -550,8 +555,8 @@ TEST_F(JSONRPCTransportTest, ReadAcrossMultipleChunks) {
// across the chunk boundary.
std::string long_str = std::string(
IOTransport<test_protocol::ProtoDesc>::kReadBufferSize * 2, 'x');
- Write(Req{42, long_str, std::nullopt});
- EXPECT_CALL(message_handler, Received(Req{42, long_str, std::nullopt}));
+ Write(Request{42, long_str, std::nullopt});
+ EXPECT_CALL(message_handler, Received(Request{42, long_str, std::nullopt}));
ASSERT_THAT_ERROR(Run(), Succeeded());
}
@@ -561,7 +566,7 @@ TEST_F(JSONRPCTransportTest, ReadPartialMessage) {
std::string part1 = message.substr(0, 7);
std::string part2 = message.substr(7);
- EXPECT_CALL(message_handler, Received(Req{42, "foo", std::nullopt}));
+ EXPECT_CALL(message_handler, Received(Request{42, "foo", std::nullopt}));
ASSERT_THAT_EXPECTED(input.Write(part1.data(), part1.size()), Succeeded());
loop.AddPendingCallback(
@@ -591,9 +596,11 @@ TEST_F(JSONRPCTransportTest, ReaderWithUnhandledData) {
}
TEST_F(JSONRPCTransportTest, Write) {
- ASSERT_THAT_ERROR(transport->Send(Req{11, "foo", std::nullopt}), Succeeded());
- ASSERT_THAT_ERROR(transport->Send(Resp{14, 0, "bar"}), Succeeded());
- ASSERT_THAT_ERROR(transport->Send(Evt{"baz", std::nullopt}), Succeeded());
+ ASSERT_THAT_ERROR(transport->Send(Request{11, "foo", std::nullopt}),
+ Succeeded());
+ ASSERT_THAT_ERROR(transport->Send(Response{14, 0, "bar"}), Succeeded());
+ ASSERT_THAT_ERROR(transport->Send(test_protocol::Event{"baz", std::nullopt}),
+ Succeeded());
output.CloseWriteFileDescriptor();
char buf[1024];
Expected<size_t> bytes_read =
@@ -624,8 +631,8 @@ TEST_F(TransportBinderTest, OutBoundRequests) {
EXPECT_EQ(result->c, 3);
replied = true;
});
- EXPECT_CALL(remote, Received(Req{1, "add", MyFnParams{1, 2}}));
- EXPECT_THAT_ERROR(from_remote->Send(Resp{1, 0, toJSON(MyFnResult{3})}),
+ EXPECT_CALL(remote, Received(Request{1, "add", MyFnParams{1, 2}}));
+ EXPECT_THAT_ERROR(from_remote->Send(Response{1, 0, toJSON(MyFnResult{3})}),
Succeeded());
Run();
EXPECT_TRUE(replied);
@@ -640,8 +647,8 @@ TEST_F(TransportBinderTest, OutBoundRequestsVoidParams) {
EXPECT_EQ(result->c, 3);
replied = true;
});
- EXPECT_CALL(remote, Received(Req{1, "voidParam", std::nullopt}));
- EXPECT_THAT_ERROR(from_remote->Send(Resp{1, 0, toJSON(MyFnResult{3})}),
+ EXPECT_CALL(remote, Received(Request{1, "voidParam", std::nullopt}));
+ EXPECT_THAT_ERROR(from_remote->Send(Response{1, 0, toJSON(MyFnResult{3})}),
Succeeded());
Run();
EXPECT_TRUE(replied);
@@ -655,8 +662,9 @@ TEST_F(TransportBinderTest, OutBoundRequestsVoidResult) {
EXPECT_THAT_ERROR(std::move(error), Succeeded());
replied = true;
});
- EXPECT_CALL(remote, Received(Req{1, "voidResult", MyFnParams{4, 5}}));
- EXPECT_THAT_ERROR(from_remote->Send(Resp{1, 0, std::nullopt}), Succeeded());
+ EXPECT_CALL(remote, Received(Request{1, "voidResult", MyFnParams{4, 5}}));
+ EXPECT_THAT_ERROR(from_remote->Send(Response{1, 0, std::nullopt}),
+ Succeeded());
Run();
EXPECT_TRUE(replied);
}
@@ -669,8 +677,9 @@ TEST_F(TransportBinderTest, OutBoundRequestsVoidParamsAndVoidResult) {
EXPECT_THAT_ERROR(std::move(error), Succeeded());
replied = true;
});
- EXPECT_CALL(remote, Received(Req{1, "voidParamAndResult", std::nullopt}));
- EXPECT_THAT_ERROR(from_remote->Send(Resp{1, 0, std::nullopt}), Succeeded());
+ EXPECT_CALL(remote, Received(Request{1, "voidParamAndResult", std::nullopt}));
+ EXPECT_THAT_ERROR(from_remote->Send(Response{1, 0, std::nullopt}),
+ Succeeded());
Run();
EXPECT_TRUE(replied);
}
@@ -686,10 +695,10 @@ TEST_F(TransportBinderTest, InBoundRequests) {
return MyFnResult{params.a + params.b + captured_param};
},
2);
- EXPECT_THAT_ERROR(from_remote->Send(Req{1, "add", MyFnParams{3, 4}}),
+ EXPECT_THAT_ERROR(from_remote->Send(Request{1, "add", MyFnParams{3, 4}}),
Succeeded());
- EXPECT_CALL(remote, Received(Resp{1, 0, MyFnResult{9}}));
+ EXPECT_CALL(remote, Received(Response{1, 0, MyFnResult{9}}));
Run();
EXPECT_TRUE(called);
}
@@ -703,9 +712,9 @@ TEST_F(TransportBinderTest, InBoundRequestsVoidParams) {
return MyFnResult{captured_param};
},
2);
- EXPECT_THAT_ERROR(from_remote->Send(Req{2, "voidParam", std::nullopt}),
+ EXPECT_THAT_ERROR(from_remote->Send(Request{2, "voidParam", std::nullopt}),
Succeeded());
- EXPECT_CALL(remote, Received(Resp{2, 0, MyFnResult{2}}));
+ EXPECT_CALL(remote, Received(Response{2, 0, MyFnResult{2}}));
Run();
EXPECT_TRUE(called);
}
@@ -722,9 +731,10 @@ TEST_F(TransportBinderTest, InBoundRequestsVoidResult) {
return llvm::Error::success();
},
2);
- EXPECT_THAT_ERROR(from_remote->Send(Req{3, "voidResult", MyFnParams{3, 4}}),
- Succeeded());
- EXPECT_CALL(remote, Received(Resp{3, 0, std::nullopt}));
+ EXPECT_THAT_ERROR(
+ from_remote->Send(Request{3, "voidResult", MyFnParams{3, 4}}),
+ Succeeded());
+ EXPECT_CALL(remote, Received(Response{3, 0, std::nullopt}));
Run();
EXPECT_TRUE(called);
}
@@ -739,9 +749,9 @@ TEST_F(TransportBinderTest, InBoundRequestsVoidParamsAndResult) {
},
2);
EXPECT_THAT_ERROR(
- from_remote->Send(Req{4, "voidParamAndResult", std::nullopt}),
+ from_remote->Send(Request{4, "voidParamAndResult", std::nullopt}),
Succeeded());
- EXPECT_CALL(remote, Received(Resp{4, 0, std::nullopt}));
+ EXPECT_CALL(remote, Received(Response{4, 0, std::nullopt}));
Run();
EXPECT_TRUE(called);
}
@@ -750,14 +760,14 @@ TEST_F(TransportBinderTest, InBoundRequestsVoidParamsAndResult) {
TEST_F(TransportBinderTest, OutBoundEvents) {
OutgoingEvent<MyFnParams> emitEvent = binder->Bind<MyFnParams>("evt");
emitEvent(MyFnParams{1, 2});
- EXPECT_CALL(remote, Received(Evt{"evt", MyFnParams{1, 2}}));
+ EXPECT_CALL(remote, Received(test_protocol::Event{"evt", MyFnParams{1, 2}}));
Run();
}
TEST_F(TransportBinderTest, OutBoundEventsVoidParams) {
OutgoingEvent<void> emitEvent = binder->Bind<void>("evt");
emitEvent();
- EXPECT_CALL(remote, Received(Evt{"evt", std::nullopt}));
+ EXPECT_CALL(remote, Received(test_protocol::Event{"evt", std::nullopt}));
Run();
}
@@ -773,8 +783,9 @@ TEST_F(TransportBinderTest, InBoundEvents) {
called = true;
},
42);
- EXPECT_THAT_ERROR(from_remote->Send(Evt{"evt", MyFnParams{3, 4}}),
- Succeeded());
+ EXPECT_THAT_ERROR(
+ from_remote->Send(test_protocol::Event{"evt", MyFnParams{3, 4}}),
+ Succeeded());
Run();
EXPECT_TRUE(called);
}
@@ -788,7 +799,9 @@ TEST_F(TransportBinderTest, InBoundEventsVoidParams) {
called = true;
},
42);
- EXPECT_THAT_ERROR(from_remote->Send(Evt{"evt", std::nullopt}), Succeeded());
+ EXPECT_THAT_ERROR(
+ from_remote->Send(test_protocol::Event{"evt", std::nullopt}),
+ Succeeded());
Run();
EXPECT_TRUE(called);
}
diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
index 068860e..6f5d9fd 100644
--- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
+++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
@@ -105,6 +105,11 @@ void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBBreakpoint(
return nullptr;
}
+void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBBreakpointLocation(
+ PyObject *data) {
+ return nullptr;
+}
+
void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBAttachInfo(
PyObject *data) {
return nullptr;
@@ -130,6 +135,11 @@ lldb_private::python::LLDBSWIGPython_CastPyObjectToSBStream(PyObject *data) {
return nullptr;
}
+void *
+lldb_private::python::LLDBSWIGPython_CastPyObjectToSBFrame(PyObject *data) {
+ return nullptr;
+}
+
void *lldb_private::python::LLDBSWIGPython_CastPyObjectToSBSymbolContext(
PyObject *data) {
return nullptr;
diff --git a/llvm/benchmarks/Mustache.cpp b/llvm/benchmarks/Mustache.cpp
index 6d24f54..996eca41 100644
--- a/llvm/benchmarks/Mustache.cpp
+++ b/llvm/benchmarks/Mustache.cpp
@@ -8,7 +8,7 @@
static const std::string LongHtmlString = [] {
std::string S;
S.reserve(500000);
- for (int i = 0; i < 50000; ++i) {
+ for (int Idx = 0; Idx < 50000; ++Idx) {
S += "<script>alert('xss');</script>";
}
return S;
@@ -153,7 +153,11 @@ static const std::string LargeOutputStringTemplate = "{{long_string}}";
// syntaxes.
static void BM_Mustache_StringRendering(benchmark::State &state,
const std::string &TplStr) {
- llvm::mustache::Template Tpl(TplStr);
+ llvm::BumpPtrAllocator Allocator;
+ llvm::StringSaver Saver(Allocator);
+ llvm::mustache::MustacheContext Ctx(Allocator, Saver);
+
+ llvm::mustache::Template Tpl(TplStr, Ctx);
llvm::json::Value Data =
llvm::json::Object({{"content", llvm::json::Value(LongHtmlString)}});
for (auto _ : state) {
@@ -172,7 +176,11 @@ BENCHMARK_CAPTURE(BM_Mustache_StringRendering, Unescaped_Ampersand,
// Tests the "hot render" cost of repeatedly traversing a deep and wide
// JSON object.
static void BM_Mustache_DeepTraversal(benchmark::State &state) {
- llvm::mustache::Template Tpl(DeepTraversalTemplate);
+ llvm::BumpPtrAllocator Allocator;
+ llvm::StringSaver Saver(Allocator);
+ llvm::mustache::MustacheContext Ctx(Allocator, Saver);
+
+ llvm::mustache::Template Tpl(DeepTraversalTemplate, Ctx);
for (auto _ : state) {
std::string Result;
llvm::raw_string_ostream OS(Result);
@@ -184,7 +192,12 @@ BENCHMARK(BM_Mustache_DeepTraversal);
// Tests the "hot render" cost of pushing and popping a deep context stack.
static void BM_Mustache_DeeplyNestedRendering(benchmark::State &state) {
- llvm::mustache::Template Tpl(DeeplyNestedRenderingTemplate);
+
+ llvm::BumpPtrAllocator Allocator;
+ llvm::StringSaver Saver(Allocator);
+ llvm::mustache::MustacheContext Ctx(Allocator, Saver);
+
+ llvm::mustache::Template Tpl(DeeplyNestedRenderingTemplate, Ctx);
for (auto _ : state) {
std::string Result;
llvm::raw_string_ostream OS(Result);
@@ -197,7 +210,11 @@ BENCHMARK(BM_Mustache_DeeplyNestedRendering);
// Tests the performance of the loop logic when iterating over a huge number of
// items.
static void BM_Mustache_HugeArrayIteration(benchmark::State &state) {
- llvm::mustache::Template Tpl(HugeArrayIterationTemplate);
+ llvm::BumpPtrAllocator Allocator;
+ llvm::StringSaver Saver(Allocator);
+ llvm::mustache::MustacheContext Ctx(Allocator, Saver);
+
+ llvm::mustache::Template Tpl(HugeArrayIterationTemplate, Ctx);
for (auto _ : state) {
std::string Result;
llvm::raw_string_ostream OS(Result);
@@ -209,8 +226,12 @@ BENCHMARK(BM_Mustache_HugeArrayIteration);
// Tests the performance of the parser on a large, "wide" template.
static void BM_Mustache_ComplexTemplateParsing(benchmark::State &state) {
+ llvm::BumpPtrAllocator Allocator;
+ llvm::StringSaver Saver(Allocator);
+ llvm::mustache::MustacheContext Ctx(Allocator, Saver);
+
for (auto _ : state) {
- llvm::mustache::Template Tpl(ComplexTemplateParsingTemplate);
+ llvm::mustache::Template Tpl(ComplexTemplateParsingTemplate, Ctx);
benchmark::DoNotOptimize(Tpl);
}
}
@@ -218,8 +239,12 @@ BENCHMARK(BM_Mustache_ComplexTemplateParsing);
// Tests the performance of the parser on a small, "deep" template.
static void BM_Mustache_SmallTemplateParsing(benchmark::State &state) {
+ llvm::BumpPtrAllocator Allocator;
+ llvm::StringSaver Saver(Allocator);
+ llvm::mustache::MustacheContext Ctx(Allocator, Saver);
+
for (auto _ : state) {
- llvm::mustache::Template Tpl(SmallTemplateParsingTemplate);
+ llvm::mustache::Template Tpl(SmallTemplateParsingTemplate, Ctx);
benchmark::DoNotOptimize(Tpl);
}
}
@@ -227,7 +252,11 @@ BENCHMARK(BM_Mustache_SmallTemplateParsing);
// Tests the performance of rendering a template that includes a partial.
static void BM_Mustache_PartialsRendering(benchmark::State &state) {
- llvm::mustache::Template Tpl(ComplexPartialTemplate);
+ llvm::BumpPtrAllocator Allocator;
+ llvm::StringSaver Saver(Allocator);
+ llvm::mustache::MustacheContext Ctx(Allocator, Saver);
+
+ llvm::mustache::Template Tpl(ComplexPartialTemplate, Ctx);
Tpl.registerPartial("item_partial", ItemPartialTemplate);
llvm::json::Value Data = HugeArrayData;
@@ -243,7 +272,11 @@ BENCHMARK(BM_Mustache_PartialsRendering);
// Tests the performance of the underlying buffer management when generating a
// very large output.
static void BM_Mustache_LargeOutputString(benchmark::State &state) {
- llvm::mustache::Template Tpl(LargeOutputStringTemplate);
+ llvm::BumpPtrAllocator Allocator;
+ llvm::StringSaver Saver(Allocator);
+ llvm::mustache::MustacheContext Ctx(Allocator, Saver);
+
+ llvm::mustache::Template Tpl(LargeOutputStringTemplate, Ctx);
for (auto _ : state) {
std::string Result;
llvm::raw_string_ostream OS(Result);
diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index 965a21b..bf3de11 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -508,7 +508,7 @@ embeddings can be computed and accessed via an ``ir2vec::Embedder`` instance.
.. code-block:: c++
- const ir2vec::Embedding &FuncVector = Emb->getFunctionVector();
+ ir2vec::Embedding FuncVector = Emb->getFunctionVector();
Currently, ``Embedder`` can generate embeddings at three levels: Instructions,
Basic Blocks, and Functions. Appropriate getters are provided to access the
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 81409df..6bc51fe 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -533,21 +533,20 @@ protected:
/// in the IR instructions to generate the vector representation.
const float OpcWeight, TypeWeight, ArgWeight;
- // Utility maps - these are used to store the vector representations of
- // instructions, basic blocks and functions.
- mutable Embedding FuncVector;
- mutable BBEmbeddingsMap BBVecMap;
- mutable InstEmbeddingsMap InstVecMap;
-
- LLVM_ABI Embedder(const Function &F, const Vocabulary &Vocab);
+ LLVM_ABI Embedder(const Function &F, const Vocabulary &Vocab)
+ : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()),
+ OpcWeight(ir2vec::OpcWeight), TypeWeight(ir2vec::TypeWeight),
+ ArgWeight(ir2vec::ArgWeight) {}
- /// Function to compute embeddings. It generates embeddings for all
- /// the instructions and basic blocks in the function F.
- void computeEmbeddings() const;
+ /// Function to compute embeddings.
+ Embedding computeEmbeddings() const;
/// Function to compute the embedding for a given basic block.
+ Embedding computeEmbeddings(const BasicBlock &BB) const;
+
+ /// Function to compute the embedding for a given instruction.
/// Specific to the kind of embeddings being computed.
- virtual void computeEmbeddings(const BasicBlock &BB) const = 0;
+ virtual Embedding computeEmbeddings(const Instruction &I) const = 0;
public:
virtual ~Embedder() = default;
@@ -556,23 +555,27 @@ public:
LLVM_ABI static std::unique_ptr<Embedder>
create(IR2VecKind Mode, const Function &F, const Vocabulary &Vocab);
- /// Returns a map containing instructions and the corresponding embeddings for
- /// the function F if it has been computed. If not, it computes the embeddings
- /// for the function and returns the map.
- LLVM_ABI const InstEmbeddingsMap &getInstVecMap() const;
-
- /// Returns a map containing basic block and the corresponding embeddings for
- /// the function F if it has been computed. If not, it computes the embeddings
- /// for the function and returns the map.
- LLVM_ABI const BBEmbeddingsMap &getBBVecMap() const;
+ /// Computes and returns the embedding for a given instruction in the function
+ /// F
+ LLVM_ABI Embedding getInstVector(const Instruction &I) const {
+ return computeEmbeddings(I);
+ }
- /// Returns the embedding for a given basic block in the function F if it has
- /// been computed. If not, it computes the embedding for the basic block and
- /// returns it.
- LLVM_ABI const Embedding &getBBVector(const BasicBlock &BB) const;
+ /// Computes and returns the embedding for a given basic block in the function
+ /// F
+ LLVM_ABI Embedding getBBVector(const BasicBlock &BB) const {
+ return computeEmbeddings(BB);
+ }
/// Computes and returns the embedding for the current function.
- LLVM_ABI const Embedding &getFunctionVector() const;
+ LLVM_ABI Embedding getFunctionVector() const { return computeEmbeddings(); }
+
+ /// Invalidate embeddings if cached. The embeddings may not be relevant
+ /// anymore when the IR changes due to transformations. In such cases, the
+ /// cached embeddings should be invalidated to ensure
+ /// correctness/recomputation. This is a no-op for SymbolicEmbedder but
+ /// removes all the cached entries in FlowAwareEmbedder.
+ virtual void invalidateEmbeddings() { return; }
};
/// Class for computing the Symbolic embeddings of IR2Vec.
@@ -580,7 +583,7 @@ public:
/// representations obtained from the Vocabulary.
class LLVM_ABI SymbolicEmbedder : public Embedder {
private:
- void computeEmbeddings(const BasicBlock &BB) const override;
+ Embedding computeEmbeddings(const Instruction &I) const override;
public:
SymbolicEmbedder(const Function &F, const Vocabulary &Vocab)
@@ -592,11 +595,15 @@ public:
/// embeddings, and additionally capture the flow information in the IR.
class LLVM_ABI FlowAwareEmbedder : public Embedder {
private:
- void computeEmbeddings(const BasicBlock &BB) const override;
+ // FlowAware embeddings would benefit from caching instruction embeddings as
+ // they are reused while computing the embeddings of other instructions.
+ mutable InstEmbeddingsMap InstVecMap;
+ Embedding computeEmbeddings(const Instruction &I) const override;
public:
FlowAwareEmbedder(const Function &F, const Vocabulary &Vocab)
: Embedder(F, Vocab) {}
+ void invalidateEmbeddings() override { InstVecMap.clear(); }
};
} // namespace ir2vec
diff --git a/llvm/include/llvm/IR/DiagnosticInfo.h b/llvm/include/llvm/IR/DiagnosticInfo.h
index 5f7225e..a426fb0 100644
--- a/llvm/include/llvm/IR/DiagnosticInfo.h
+++ b/llvm/include/llvm/IR/DiagnosticInfo.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
@@ -555,6 +556,7 @@ public:
Argument(StringRef Key, bool B) : Key(Key), Val(B ? "true" : "false") {}
LLVM_ABI Argument(StringRef Key, DebugLoc dl);
LLVM_ABI Argument(StringRef Key, InstructionCost C);
+ LLVM_ABI Argument(StringRef Key, BranchProbability P);
};
/// \p PassName is the name of the pass emitting this diagnostic. \p
diff --git a/llvm/include/llvm/Support/Mustache.h b/llvm/include/llvm/Support/Mustache.h
index ee9f406..83047f2 100644
--- a/llvm/include/llvm/Support/Mustache.h
+++ b/llvm/include/llvm/Support/Mustache.h
@@ -71,6 +71,8 @@
#include "Error.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/JSON.h"
@@ -84,10 +86,15 @@ using Lambda = std::function<llvm::json::Value()>;
using SectionLambda = std::function<llvm::json::Value(std::string)>;
class ASTNode;
-using AstPtr = std::unique_ptr<ASTNode>;
+using AstPtr = ASTNode *;
using EscapeMap = DenseMap<char, std::string>;
+using ASTNodeList = iplist<ASTNode>;
struct MustacheContext {
+ MustacheContext(BumpPtrAllocator &Allocator, StringSaver &Saver)
+ : Allocator(Allocator), Saver(Saver) {}
+ BumpPtrAllocator &Allocator;
+ StringSaver &Saver;
StringMap<AstPtr> Partials;
StringMap<Lambda> Lambdas;
StringMap<SectionLambda> SectionLambdas;
@@ -98,7 +105,7 @@ struct MustacheContext {
// and Lambdas that are registered with it.
class Template {
public:
- LLVM_ABI Template(StringRef TemplateStr);
+ LLVM_ABI Template(StringRef TemplateStr, MustacheContext &Ctx);
Template(const Template &) = delete;
@@ -110,7 +117,7 @@ public:
// type.
LLVM_ABI ~Template();
- LLVM_ABI Template &operator=(Template &&Other) noexcept;
+ Template &operator=(Template &&) = delete;
LLVM_ABI void render(const llvm::json::Value &Data, llvm::raw_ostream &OS);
@@ -126,7 +133,7 @@ public:
LLVM_ABI void overrideEscapeCharacters(DenseMap<char, std::string> Escapes);
private:
- MustacheContext Ctx;
+ MustacheContext &Ctx;
AstPtr Tree;
};
} // namespace llvm::mustache
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 5e57dca..7bc90d4 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -205,6 +205,10 @@ def SDTSetCC : SDTypeProfile<1, 3, [ // setcc
SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
+def SDTFSetCC : SDTypeProfile<1, 3, [ // strict_fsetcc, strict_fsetccs
+ SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
+]>;
+
def SDTSelect : SDTypeProfile<1, 3, [ // select
SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>
]>;
@@ -699,8 +703,8 @@ def strict_bf16_to_fp : SDNode<"ISD::STRICT_BF16_TO_FP",
def strict_fp_to_bf16 : SDNode<"ISD::STRICT_FP_TO_BF16",
SDTFPToIntOp, [SDNPHasChain]>;
-def strict_fsetcc : SDNode<"ISD::STRICT_FSETCC", SDTSetCC, [SDNPHasChain]>;
-def strict_fsetccs : SDNode<"ISD::STRICT_FSETCCS", SDTSetCC, [SDNPHasChain]>;
+def strict_fsetcc : SDNode<"ISD::STRICT_FSETCC", SDTFSetCC, [SDNPHasChain]>;
+def strict_fsetccs : SDNode<"ISD::STRICT_FSETCCS", SDTFSetCC, [SDNPHasChain]>;
def get_fpenv : SDNode<"ISD::GET_FPENV", SDTGetFPStateOp, [SDNPHasChain]>;
def set_fpenv : SDNode<"ISD::SET_FPENV", SDTSetFPStateOp, [SDNPHasChain]>;
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 1794a60..85b5372 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -153,11 +153,6 @@ void Embedding::print(raw_ostream &OS) const {
// Embedder and its subclasses
//===----------------------------------------------------------------------===//
-Embedder::Embedder(const Function &F, const Vocabulary &Vocab)
- : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()),
- OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight),
- FuncVector(Embedding(Dimension)) {}
-
std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
const Vocabulary &Vocab) {
switch (Mode) {
@@ -169,110 +164,85 @@ std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
return nullptr;
}
-const InstEmbeddingsMap &Embedder::getInstVecMap() const {
- if (InstVecMap.empty())
- computeEmbeddings();
- return InstVecMap;
-}
-
-const BBEmbeddingsMap &Embedder::getBBVecMap() const {
- if (BBVecMap.empty())
- computeEmbeddings();
- return BBVecMap;
-}
-
-const Embedding &Embedder::getBBVector(const BasicBlock &BB) const {
- auto It = BBVecMap.find(&BB);
- if (It != BBVecMap.end())
- return It->second;
- computeEmbeddings(BB);
- return BBVecMap[&BB];
-}
+Embedding Embedder::computeEmbeddings() const {
+ Embedding FuncVector(Dimension, 0.0);
-const Embedding &Embedder::getFunctionVector() const {
- // Currently, we always (re)compute the embeddings for the function.
- // This is cheaper than caching the vector.
- computeEmbeddings();
- return FuncVector;
-}
-
-void Embedder::computeEmbeddings() const {
if (F.isDeclaration())
- return;
-
- FuncVector = Embedding(Dimension, 0.0);
+ return FuncVector;
// Consider only the basic blocks that are reachable from entry
- for (const BasicBlock *BB : depth_first(&F)) {
- computeEmbeddings(*BB);
- FuncVector += BBVecMap[BB];
- }
+ for (const BasicBlock *BB : depth_first(&F))
+ FuncVector += computeEmbeddings(*BB);
+ return FuncVector;
}
-void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
+Embedding Embedder::computeEmbeddings(const BasicBlock &BB) const {
Embedding BBVector(Dimension, 0);
// We consider only the non-debug and non-pseudo instructions
- for (const auto &I : BB.instructionsWithoutDebug()) {
- Embedding ArgEmb(Dimension, 0);
- for (const auto &Op : I.operands())
- ArgEmb += Vocab[*Op];
- auto InstVector =
- Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
- if (const auto *IC = dyn_cast<CmpInst>(&I))
- InstVector += Vocab[IC->getPredicate()];
- InstVecMap[&I] = InstVector;
- BBVector += InstVector;
- }
- BBVecMap[&BB] = BBVector;
-}
-
-void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const {
- Embedding BBVector(Dimension, 0);
+ for (const auto &I : BB.instructionsWithoutDebug())
+ BBVector += computeEmbeddings(I);
+ return BBVector;
+}
+
+Embedding SymbolicEmbedder::computeEmbeddings(const Instruction &I) const {
+ // Currently, we always (re)compute the embeddings for symbolic embedder.
+ // This is cheaper than caching the vectors.
+ Embedding ArgEmb(Dimension, 0);
+ for (const auto &Op : I.operands())
+ ArgEmb += Vocab[*Op];
+ auto InstVector =
+ Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+ if (const auto *IC = dyn_cast<CmpInst>(&I))
+ InstVector += Vocab[IC->getPredicate()];
+ return InstVector;
+}
+
+Embedding FlowAwareEmbedder::computeEmbeddings(const Instruction &I) const {
+ // If we have already computed the embedding for this instruction, return it
+ auto It = InstVecMap.find(&I);
+ if (It != InstVecMap.end())
+ return It->second;
- // We consider only the non-debug and non-pseudo instructions
- for (const auto &I : BB.instructionsWithoutDebug()) {
- // TODO: Handle call instructions differently.
- // For now, we treat them like other instructions
- Embedding ArgEmb(Dimension, 0);
- for (const auto &Op : I.operands()) {
- // If the operand is defined elsewhere, we use its embedding
- if (const auto *DefInst = dyn_cast<Instruction>(Op)) {
- auto DefIt = InstVecMap.find(DefInst);
- // Fixme (#159171): Ideally we should never miss an instruction
- // embedding here.
- // But when we have cyclic dependencies (e.g., phi
- // nodes), we might miss the embedding. In such cases, we fall back to
- // using the vocabulary embedding. This can be fixed by iterating to a
- // fixed-point, or by using a simple solver for the set of simultaneous
- // equations.
- // Another case when we might miss an instruction embedding is when
- // the operand instruction is in a different basic block that has not
- // been processed yet. This can be fixed by processing the basic blocks
- // in a topological order.
- if (DefIt != InstVecMap.end())
- ArgEmb += DefIt->second;
- else
- ArgEmb += Vocab[*Op];
- }
- // If the operand is not defined by an instruction, we use the vocabulary
- else {
- LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: "
- << *Op << "=" << Vocab[*Op][0] << "\n");
+ // TODO: Handle call instructions differently.
+ // For now, we treat them like other instructions
+ Embedding ArgEmb(Dimension, 0);
+ for (const auto &Op : I.operands()) {
+ // If the operand is defined elsewhere, we use its embedding
+ if (const auto *DefInst = dyn_cast<Instruction>(Op)) {
+ auto DefIt = InstVecMap.find(DefInst);
+ // Fixme (#159171): Ideally we should never miss an instruction
+ // embedding here.
+ // But when we have cyclic dependencies (e.g., phi
+ // nodes), we might miss the embedding. In such cases, we fall back to
+ // using the vocabulary embedding. This can be fixed by iterating to a
+ // fixed-point, or by using a simple solver for the set of simultaneous
+ // equations.
+ // Another case when we might miss an instruction embedding is when
+ // the operand instruction is in a different basic block that has not
+ // been processed yet. This can be fixed by processing the basic blocks
+ // in a topological order.
+ if (DefIt != InstVecMap.end())
+ ArgEmb += DefIt->second;
+ else
ArgEmb += Vocab[*Op];
- }
}
- // Create the instruction vector by combining opcode, type, and arguments
- // embeddings
- auto InstVector =
- Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
- // Add compare predicate embedding as an additional operand if applicable
- if (const auto *IC = dyn_cast<CmpInst>(&I))
- InstVector += Vocab[IC->getPredicate()];
- InstVecMap[&I] = InstVector;
- BBVector += InstVector;
+ // If the operand is not defined by an instruction, we use the
+ // vocabulary
+ else {
+ LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: "
+ << *Op << "=" << Vocab[*Op][0] << "\n");
+ ArgEmb += Vocab[*Op];
+ }
}
- BBVecMap[&BB] = BBVector;
+ // Create the instruction vector by combining opcode, type, and arguments
+ // embeddings
+ auto InstVector =
+ Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+ if (const auto *IC = dyn_cast<CmpInst>(&I))
+ InstVector += Vocab[IC->getPredicate()];
+ InstVecMap[&I] = InstVector;
+ return InstVector;
}
// ==----------------------------------------------------------------------===//
@@ -695,25 +665,17 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
Emb->getFunctionVector().print(OS);
OS << "Basic block vectors:\n";
- const auto &BBMap = Emb->getBBVecMap();
for (const BasicBlock &BB : F) {
- auto It = BBMap.find(&BB);
- if (It != BBMap.end()) {
- OS << "Basic block: " << BB.getName() << ":\n";
- It->second.print(OS);
- }
+ OS << "Basic block: " << BB.getName() << ":\n";
+ Emb->getBBVector(BB).print(OS);
}
OS << "Instruction vectors:\n";
- const auto &InstMap = Emb->getInstVecMap();
for (const BasicBlock &BB : F) {
for (const Instruction &I : BB) {
- auto It = InstMap.find(&I);
- if (It != InstMap.end()) {
- OS << "Instruction: ";
- I.print(OS);
- It->second.print(OS);
- }
+ OS << "Instruction: ";
+ I.print(OS);
+ Emb->getInstVector(I).print(OS);
}
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c5c3866..5ffdc4e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19340,8 +19340,10 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) {
EVT VT = N->getValueType(0);
const SDNodeFlags Flags = N->getFlags();
unsigned Opc = N->getOpcode();
- bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM;
- bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM;
+ bool PropAllNaNsToQNaNs = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM;
+ bool PropOnlySNaNsToQNaNs = Opc == ISD::FMINNUM || Opc == ISD::FMAXNUM;
+ bool IsMin =
+ Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM || Opc == ISD::FMINIMUMNUM;
SelectionDAG::FlagInserter FlagsInserter(DAG, N);
// Constant fold.
@@ -19356,34 +19358,53 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) {
if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) {
const APFloat &AF = N1CFP->getValueAPF();
- // minnum(X, nan) -> X
- // maxnum(X, nan) -> X
- // minimum(X, nan) -> nan
- // maximum(X, nan) -> nan
- if (AF.isNaN())
- return PropagatesNaN ? N->getOperand(1) : N->getOperand(0);
+ // minnum(X, qnan) -> X
+ // maxnum(X, qnan) -> X
+ // minnum(X, snan) -> qnan
+ // maxnum(X, snan) -> qnan
+ // minimum(X, nan) -> qnan
+ // maximum(X, nan) -> qnan
+ // minimumnum(X, nan) -> X
+ // maximumnum(X, nan) -> X
+ if (AF.isNaN()) {
+ if (PropAllNaNsToQNaNs || (AF.isSignaling() && PropOnlySNaNsToQNaNs)) {
+ if (AF.isSignaling())
+ return DAG.getConstantFP(AF.makeQuiet(), SDLoc(N), VT);
+ return N->getOperand(1);
+ }
+ return N->getOperand(0);
+ }
// In the following folds, inf can be replaced with the largest finite
// float, if the ninf flag is set.
if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) {
- // minnum(X, -inf) -> -inf
- // maxnum(X, +inf) -> +inf
+ // minnum(X, -inf) -> -inf (ignoring sNaN -> qNaN propagation)
+ // maxnum(X, +inf) -> +inf (ignoring sNaN -> qNaN propagation)
// minimum(X, -inf) -> -inf if nnan
// maximum(X, +inf) -> +inf if nnan
- if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs()))
+ // minimumnum(X, -inf) -> -inf
+ // maximumnum(X, +inf) -> +inf
+ if (IsMin == AF.isNegative() &&
+ (!PropAllNaNsToQNaNs || Flags.hasNoNaNs()))
return N->getOperand(1);
// minnum(X, +inf) -> X if nnan
// maxnum(X, -inf) -> X if nnan
- // minimum(X, +inf) -> X
- // maximum(X, -inf) -> X
- if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs()))
+ // minimum(X, +inf) -> X (ignoring quieting of sNaNs)
+ // maximum(X, -inf) -> X (ignoring quieting of sNaNs)
+ // minimumnum(X, +inf) -> X if nnan
+ // maximumnum(X, -inf) -> X if nnan
+ if (IsMin != AF.isNegative() && (PropAllNaNsToQNaNs || Flags.hasNoNaNs()))
return N->getOperand(0);
}
}
+ // There are no VECREDUCE variants of FMINIMUMNUM or FMAXIMUMNUM
+ if (Opc == ISD::FMINIMUMNUM || Opc == ISD::FMAXIMUMNUM)
+ return SDValue();
+
if (SDValue SD = reassociateReduction(
- PropagatesNaN
+ PropAllNaNsToQNaNs
? (IsMin ? ISD::VECREDUCE_FMINIMUM : ISD::VECREDUCE_FMAXIMUM)
: (IsMin ? ISD::VECREDUCE_FMIN : ISD::VECREDUCE_FMAX),
Opc, SDLoc(N), VT, N0, N1, Flags))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 175753f..6c11c5b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -234,6 +234,19 @@ static bool dontUseFastISelFor(const Function &Fn) {
});
}
+static bool maintainPGOProfile(const TargetMachine &TM,
+ CodeGenOptLevel OptLevel) {
+ if (OptLevel != CodeGenOptLevel::None)
+ return true;
+ if (TM.getPGOOption()) {
+ const PGOOptions &Options = *TM.getPGOOption();
+ return Options.Action == PGOOptions::PGOAction::IRUse ||
+ Options.Action == PGOOptions::PGOAction::SampleUse ||
+ Options.CSAction == PGOOptions::CSPGOAction::CSIRUse;
+ }
+ return false;
+}
+
namespace llvm {
//===--------------------------------------------------------------------===//
@@ -395,6 +408,7 @@ SelectionDAGISel::~SelectionDAGISel() { delete CurDAG; }
void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
CodeGenOptLevel OptLevel = Selector->OptLevel;
+ bool RegisterPGOPasses = maintainPGOProfile(Selector->TM, Selector->OptLevel);
if (OptLevel != CodeGenOptLevel::None)
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<GCModuleInfo>();
@@ -403,15 +417,15 @@ void SelectionDAGISelLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
- if (UseMBPI && OptLevel != CodeGenOptLevel::None)
- AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ if (UseMBPI && RegisterPGOPasses)
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
// AssignmentTrackingAnalysis only runs if assignment tracking is enabled for
// the module.
AU.addRequired<AssignmentTrackingAnalysis>();
AU.addPreserved<AssignmentTrackingAnalysis>();
- if (OptLevel != CodeGenOptLevel::None)
- LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ if (RegisterPGOPasses)
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -464,6 +478,7 @@ void SelectionDAGISel::initializeAnalysisResults(
(void)MatchFilterFuncName;
#endif
+ bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel);
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
@@ -474,7 +489,7 @@ void SelectionDAGISel::initializeAnalysisResults(
auto *PSI = MAMP.getCachedResult<ProfileSummaryAnalysis>(*Fn.getParent());
BlockFrequencyInfo *BFI = nullptr;
FAM.getResult<BlockFrequencyAnalysis>(Fn);
- if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None)
+ if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses)
BFI = &FAM.getResult<BlockFrequencyAnalysis>(Fn);
FunctionVarLocs const *FnVarLocs = nullptr;
@@ -492,7 +507,7 @@ void SelectionDAGISel::initializeAnalysisResults(
// into account). That's unfortunate but OK because it just means we won't
// ask for passes that have been required anyway.
- if (UseMBPI && OptLevel != CodeGenOptLevel::None)
+ if (UseMBPI && RegisterPGOPasses)
FuncInfo->BPI = &FAM.getResult<BranchProbabilityAnalysis>(Fn);
else
FuncInfo->BPI = nullptr;
@@ -518,6 +533,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
(void)MatchFilterFuncName;
#endif
+ bool RegisterPGOPasses = maintainPGOProfile(TM, OptLevel);
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
@@ -528,7 +544,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
AC = &MFP.getAnalysis<AssumptionCacheTracker>().getAssumptionCache(Fn);
auto *PSI = &MFP.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
BlockFrequencyInfo *BFI = nullptr;
- if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOptLevel::None)
+ if (PSI && PSI->hasProfileSummary() && RegisterPGOPasses)
BFI = &MFP.getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
FunctionVarLocs const *FnVarLocs = nullptr;
@@ -549,7 +565,7 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
// into account). That's unfortunate but OK because it just means we won't
// ask for passes that have been required anyway.
- if (UseMBPI && OptLevel != CodeGenOptLevel::None)
+ if (UseMBPI && RegisterPGOPasses)
FuncInfo->BPI =
&MFP.getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
else
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index 4f37624..8e6d654 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -273,6 +273,13 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
C.print(OS);
}
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
+ BranchProbability P)
+ : Key(std::string(Key)) {
+ raw_string_ostream OS(Val);
+ P.print(OS);
+}
+
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc)
: Key(std::string(Key)), Loc(Loc) {
if (Loc) {
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 47860c0..708e79d 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -20,7 +20,7 @@ using namespace llvm::mustache;
namespace {
-using Accessor = SmallVector<std::string>;
+using Accessor = ArrayRef<StringRef>;
static bool isFalsey(const json::Value &V) {
return V.getAsNull() || (V.getAsBoolean() && !V.getAsBoolean().value()) ||
@@ -34,23 +34,32 @@ static bool isContextFalsey(const json::Value *V) {
return isFalsey(*V);
}
-static Accessor splitMustacheString(StringRef Str) {
+static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
// We split the mustache string into an accessor.
// For example:
// "a.b.c" would be split into {"a", "b", "c"}
// We make an exception for a single dot which
// refers to the current context.
- Accessor Tokens;
+ SmallVector<StringRef> Tokens;
if (Str == ".") {
- Tokens.emplace_back(Str);
- return Tokens;
- }
- while (!Str.empty()) {
- StringRef Part;
- std::tie(Part, Str) = Str.split(".");
- Tokens.emplace_back(Part.trim());
+ // "." is a special accessor that refers to the current context.
+ // It's a literal, so it doesn't need to be saved.
+ Tokens.push_back(".");
+ } else {
+ while (!Str.empty()) {
+ StringRef Part;
+ std::tie(Part, Str) = Str.split('.');
+ // Each part of the accessor needs to be saved to the arena
+ // to ensure it has a stable address.
+ Tokens.push_back(Ctx.Saver.save(Part.trim()));
+ }
}
- return Tokens;
+ // Now, allocate memory for the array of StringRefs in the arena.
+ StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size());
+ // Copy the StringRefs from the stack vector to the arena.
+ std::copy(Tokens.begin(), Tokens.end(), ArenaTokens);
+ // Return an ArrayRef pointing to the stable arena memory.
+ return ArrayRef<StringRef>(ArenaTokens, Tokens.size());
}
} // namespace
@@ -97,23 +106,23 @@ public:
SetDelimiter,
};
- Token(std::string Str)
- : TokenType(Type::Text), RawBody(std::move(Str)), TokenBody(RawBody),
+ Token(StringRef Str)
+ : TokenType(Type::Text), RawBody(Str), TokenBody(RawBody),
AccessorValue({}), Indentation(0) {};
- Token(std::string RawBody, std::string TokenBody, char Identifier)
- : RawBody(std::move(RawBody)), TokenBody(std::move(TokenBody)),
- Indentation(0) {
+ Token(StringRef RawBody, StringRef TokenBody, char Identifier,
+ MustacheContext &Ctx)
+ : RawBody(RawBody), TokenBody(TokenBody), Indentation(0) {
TokenType = getTokenType(Identifier);
if (TokenType == Type::Comment)
return;
StringRef AccessorStr(this->TokenBody);
if (TokenType != Type::Variable)
AccessorStr = AccessorStr.substr(1);
- AccessorValue = splitMustacheString(StringRef(AccessorStr).trim());
+ AccessorValue = splitMustacheString(StringRef(AccessorStr).trim(), Ctx);
}
- Accessor getAccessor() const { return AccessorValue; }
+ ArrayRef<StringRef> getAccessor() const { return AccessorValue; }
Type getType() const { return TokenType; }
@@ -144,16 +153,16 @@ public:
Type TokenType;
// RawBody is the original string that was tokenized.
- std::string RawBody;
+ StringRef RawBody;
// TokenBody is the original string with the identifier removed.
- std::string TokenBody;
- Accessor AccessorValue;
+ StringRef TokenBody;
+ ArrayRef<StringRef> AccessorValue;
size_t Indentation;
};
using EscapeMap = DenseMap<char, std::string>;
-class ASTNode {
+class ASTNode : public ilist_node<ASTNode> {
public:
enum Type {
Root,
@@ -168,18 +177,19 @@ public:
ASTNode(MustacheContext &Ctx)
: Ctx(Ctx), Ty(Type::Root), Parent(nullptr), ParentContext(nullptr) {}
- ASTNode(MustacheContext &Ctx, std::string Body, ASTNode *Parent)
- : Ctx(Ctx), Ty(Type::Text), Body(std::move(Body)), Parent(Parent),
+ ASTNode(MustacheContext &Ctx, StringRef Body, ASTNode *Parent)
+ : Ctx(Ctx), Ty(Type::Text), Body(Body), Parent(Parent),
ParentContext(nullptr) {}
// Constructor for Section/InvertSection/Variable/UnescapeVariable Nodes
- ASTNode(MustacheContext &Ctx, Type Ty, Accessor Accessor, ASTNode *Parent)
- : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(std::move(Accessor)),
+ ASTNode(MustacheContext &Ctx, Type Ty, ArrayRef<StringRef> Accessor,
+ ASTNode *Parent)
+ : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(Accessor),
ParentContext(nullptr) {}
- void addChild(AstPtr Child) { Children.emplace_back(std::move(Child)); };
+ void addChild(AstPtr Child) { Children.push_back(Child); };
- void setRawBody(std::string NewBody) { RawBody = std::move(NewBody); };
+ void setRawBody(StringRef NewBody) { RawBody = NewBody; };
void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; };
@@ -212,28 +222,27 @@ private:
MustacheContext &Ctx;
Type Ty;
size_t Indentation = 0;
- std::string RawBody;
- std::string Body;
+ StringRef RawBody;
+ StringRef Body;
ASTNode *Parent;
- // TODO: switch implementation to SmallVector<T>
- std::vector<AstPtr> Children;
- const Accessor AccessorValue;
+ ASTNodeList Children;
+ const ArrayRef<StringRef> AccessorValue;
const llvm::json::Value *ParentContext;
};
// A wrapper for arena allocator for ASTNodes
static AstPtr createRootNode(MustacheContext &Ctx) {
- return std::make_unique<ASTNode>(Ctx);
+ return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx);
}
-static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, Accessor A,
- ASTNode *Parent) {
- return std::make_unique<ASTNode>(Ctx, T, std::move(A), Parent);
+static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T,
+ ArrayRef<StringRef> A, ASTNode *Parent) {
+ return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, T, A, Parent);
}
-static AstPtr createTextNode(MustacheContext &Ctx, std::string Body,
+static AstPtr createTextNode(MustacheContext &Ctx, StringRef Body,
ASTNode *Parent) {
- return std::make_unique<ASTNode>(Ctx, std::move(Body), Parent);
+ return new (Ctx.Allocator.Allocate<ASTNode>()) ASTNode(Ctx, Body, Parent);
}
// Function to check if there is meaningful text behind.
@@ -295,9 +304,9 @@ static void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) {
StringRef NextTokenBody = NextToken.TokenBody;
// Cut off the leading newline which could be \n or \r\n.
if (NextTokenBody.starts_with("\r\n"))
- NextToken.TokenBody = NextTokenBody.substr(2).str();
+ NextToken.TokenBody = NextTokenBody.substr(2);
else if (NextTokenBody.starts_with("\n"))
- NextToken.TokenBody = NextTokenBody.substr(1).str();
+ NextToken.TokenBody = NextTokenBody.substr(1);
}
// Adjust previous token body if there no text behind.
@@ -312,7 +321,7 @@ void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx,
StringRef PrevTokenBody = PrevToken.TokenBody;
StringRef Unindented = PrevTokenBody.rtrim(" \r\t\v");
size_t Indentation = PrevTokenBody.size() - Unindented.size();
- PrevToken.TokenBody = Unindented.str();
+ PrevToken.TokenBody = Unindented;
CurrentToken.setIndentation(Indentation);
}
@@ -402,21 +411,20 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
}
static std::optional<std::pair<StringRef, StringRef>>
-processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) {
+processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
<< ", Kind: " << tagKindToString(T.TagKind) << "\n");
if (T.TagKind == Tag::Kind::Triple) {
- Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&');
+ Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
return std::nullopt;
}
StringRef Interpolated = T.Content;
- std::string RawBody = T.FullMatch.str();
if (!Interpolated.trim().starts_with("=")) {
char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
- Tokens.emplace_back(RawBody, Interpolated.str(), Front);
+ Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
return std::nullopt;
}
- Tokens.emplace_back(RawBody, Interpolated.str(), '=');
+ Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
StringRef DelimSpec = Interpolated.trim();
DelimSpec = DelimSpec.drop_front(1);
DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
@@ -432,7 +440,7 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) {
// The mustache spec allows {{{ }}} to unescape variables,
// but we don't support that here. An unescape variable
// is represented only by {{& variable}}.
-static SmallVector<Token> tokenize(StringRef Template) {
+static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
SmallVector<Token> Tokens;
SmallString<8> Open("{{");
@@ -446,19 +454,17 @@ static SmallVector<Token> tokenize(StringRef Template) {
if (T.TagKind == Tag::Kind::None) {
// No more tags, the rest is text.
- Tokens.emplace_back(Template.substr(Start).str());
- LLVM_DEBUG(dbgs() << " No more tags. Created final Text token: \""
- << Template.substr(Start) << "\"\n");
+ Tokens.emplace_back(Template.substr(Start));
break;
}
// Add the text before the tag.
if (T.StartPosition > Start) {
StringRef Text = Template.substr(Start, T.StartPosition - Start);
- Tokens.emplace_back(Text.str());
+ Tokens.emplace_back(Text);
}
- if (auto NewDelims = processTag(T, Tokens)) {
+ if (auto NewDelims = processTag(T, Tokens, Ctx)) {
std::tie(Open, Close) = *NewDelims;
}
@@ -614,20 +620,20 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty,
const Accessor &A) {
AstPtr CurrentNode = createNode(Ctx, Ty, A, Parent);
size_t Start = CurrentPtr;
- parseMustache(CurrentNode.get());
+ parseMustache(CurrentNode);
const size_t End = CurrentPtr - 1;
- std::string RawBody;
+ SmallString<128> RawBody;
for (std::size_t I = Start; I < End; I++)
RawBody += Tokens[I].RawBody;
- CurrentNode->setRawBody(std::move(RawBody));
- Parent->addChild(std::move(CurrentNode));
+ CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody)));
+ Parent->addChild(CurrentNode);
}
AstPtr Parser::parse() {
- Tokens = tokenize(TemplateStr);
+ Tokens = tokenize(TemplateStr, Ctx);
CurrentPtr = 0;
AstPtr RootNode = createRootNode(Ctx);
- parseMustache(RootNode.get());
+ parseMustache(RootNode);
return RootNode;
}
@@ -636,31 +642,29 @@ void Parser::parseMustache(ASTNode *Parent) {
while (CurrentPtr < Tokens.size()) {
Token CurrentToken = Tokens[CurrentPtr];
CurrentPtr++;
- Accessor A = CurrentToken.getAccessor();
+ ArrayRef<StringRef> A = CurrentToken.getAccessor();
AstPtr CurrentNode;
switch (CurrentToken.getType()) {
case Token::Type::Text: {
- CurrentNode =
- createTextNode(Ctx, std::move(CurrentToken.TokenBody), Parent);
- Parent->addChild(std::move(CurrentNode));
+ CurrentNode = createTextNode(Ctx, CurrentToken.TokenBody, Parent);
+ Parent->addChild(CurrentNode);
break;
}
case Token::Type::Variable: {
- CurrentNode = createNode(Ctx, ASTNode::Variable, std::move(A), Parent);
- Parent->addChild(std::move(CurrentNode));
+ CurrentNode = createNode(Ctx, ASTNode::Variable, A, Parent);
+ Parent->addChild(CurrentNode);
break;
}
case Token::Type::UnescapeVariable: {
- CurrentNode =
- createNode(Ctx, ASTNode::UnescapeVariable, std::move(A), Parent);
- Parent->addChild(std::move(CurrentNode));
+ CurrentNode = createNode(Ctx, ASTNode::UnescapeVariable, A, Parent);
+ Parent->addChild(CurrentNode);
break;
}
case Token::Type::Partial: {
- CurrentNode = createNode(Ctx, ASTNode::Partial, std::move(A), Parent);
+ CurrentNode = createNode(Ctx, ASTNode::Partial, A, Parent);
CurrentNode->setIndentation(CurrentToken.getIndentation());
- Parent->addChild(std::move(CurrentNode));
+ Parent->addChild(CurrentNode);
break;
}
case Token::Type::SectionOpen: {
@@ -694,8 +698,7 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) {
return;
}
case json::Value::String: {
- auto Str = *Data.getAsString();
- OS << Str.str();
+ OS << *Data.getAsString();
return;
}
@@ -727,7 +730,7 @@ void ASTNode::renderPartial(const json::Value &CurrentCtx,
<< ", Indentation:" << Indentation << "\n");
auto Partial = Ctx.Partials.find(AccessorValue[0]);
if (Partial != Ctx.Partials.end())
- renderPartial(CurrentCtx, OS, Partial->getValue().get());
+ renderPartial(CurrentCtx, OS, Partial->getValue());
}
void ASTNode::renderVariable(const json::Value &CurrentCtx,
@@ -858,8 +861,8 @@ const json::Value *ASTNode::findContext() {
void ASTNode::renderChild(const json::Value &Contexts,
MustacheOutputStream &OS) {
- for (AstPtr &Child : Children)
- Child->render(Contexts, OS);
+ for (ASTNode &Child : Children)
+ Child.render(Contexts, OS);
}
void ASTNode::renderPartial(const json::Value &Contexts,
@@ -869,7 +872,7 @@ void ASTNode::renderPartial(const json::Value &Contexts,
Partial->render(Contexts, IS);
}
-void ASTNode::renderLambdas(const json::Value &Contexts,
+void ASTNode::renderLambdas(const llvm::json::Value &Contexts,
MustacheOutputStream &OS, Lambda &L) {
json::Value LambdaResult = L();
std::string LambdaStr;
@@ -886,9 +889,9 @@ void ASTNode::renderLambdas(const json::Value &Contexts,
LambdaNode->render(Contexts, OS);
}
-void ASTNode::renderSectionLambdas(const json::Value &Contexts,
+void ASTNode::renderSectionLambdas(const llvm::json::Value &Contexts,
MustacheOutputStream &OS, SectionLambda &L) {
- json::Value Return = L(RawBody);
+ json::Value Return = L(RawBody.str());
if (isFalsey(Return))
return;
std::string LambdaStr;
@@ -899,15 +902,16 @@ void ASTNode::renderSectionLambdas(const json::Value &Contexts,
LambdaNode->render(Contexts, OS);
}
-void Template::render(const json::Value &Data, llvm::raw_ostream &OS) {
+void Template::render(const llvm::json::Value &Data, llvm::raw_ostream &OS) {
RawMustacheOutputStream MOS(OS);
Tree->render(Data, MOS);
}
void Template::registerPartial(std::string Name, std::string Partial) {
- Parser P(Partial, Ctx);
+ StringRef SavedPartial = Ctx.Saver.save(Partial);
+ Parser P(SavedPartial, Ctx);
AstPtr PartialTree = P.parse();
- Ctx.Partials.insert(std::make_pair(Name, std::move(PartialTree)));
+ Ctx.Partials.insert(std::make_pair(Name, PartialTree));
}
void Template::registerLambda(std::string Name, Lambda L) {
@@ -922,7 +926,7 @@ void Template::overrideEscapeCharacters(EscapeMap E) {
Ctx.Escapes = std::move(E);
}
-Template::Template(StringRef TemplateStr) {
+Template::Template(StringRef TemplateStr, MustacheContext &Ctx) : Ctx(Ctx) {
Parser P(TemplateStr, Ctx);
Tree = P.parse();
// The default behavior is to escape html entities.
@@ -935,18 +939,12 @@ Template::Template(StringRef TemplateStr) {
}
Template::Template(Template &&Other) noexcept
- : Ctx(std::move(Other.Ctx)), Tree(std::move(Other.Tree)) {}
+ : Ctx(Other.Ctx), Tree(Other.Tree) {
+ Other.Tree = nullptr;
+}
Template::~Template() = default;
-Template &Template::operator=(Template &&Other) noexcept {
- if (this != &Other) {
- Ctx = std::move(Other.Ctx);
- Tree = std::move(Other.Tree);
- Other.Tree = nullptr;
- }
- return *this;
-}
} // namespace llvm::mustache
#undef DEBUG_TYPE
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dc8e7c8..31b3d18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1458,6 +1458,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
+ setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
if (Subtarget->hasMatMulInt8()) {
@@ -30769,6 +30770,17 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
ResultVT.isFixedLengthVector() &&
useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
+ // We can handle this case natively by accumulating into a wider
+ // zero-padded vector.
+ if (!ConvertToScalable && ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) {
+ SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32);
+ SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0);
+ SDValue Wide =
+ DAG.getNode(Op.getOpcode(), DL, MVT::v4i32, WideAcc, LHS, RHS);
+ SDValue Reduced = DAG.getNode(AArch64ISD::ADDP, DL, MVT::v4i32, Wide, Wide);
+ return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0);
+ }
+
if (ConvertToScalable) {
ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bc047a4a..a1fb665 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -651,7 +651,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom conversions to/from v2i8.
setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
- // Only logical ops can be done on v4i8 directly, others must be done
+ // Only logical ops can be done on v4i8/v2i32 directly, others must be done
// elementwise.
setOperationAction(
{ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE,
@@ -669,7 +669,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM,
ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT,
ISD::USUBSAT},
- MVT::v4i8, Expand);
+ {MVT::v4i8, MVT::v2i32}, Expand);
// Operations not directly supported by NVPTX.
for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
@@ -689,7 +689,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 40c05e8..333b693 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1823,6 +1823,11 @@ def TuneConditionalCompressedMoveFusion
def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">;
def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
+def TuneHasSingleElementVecFP64
+ : SubtargetFeature<"single-element-vec-fp64", "HasSingleElementVectorFP64", "true",
+ "Certain vector FP64 operations produce a single result "
+ "element per cycle">;
+
def TuneMIPSP8700
: SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700",
"MIPS p8700 processor">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 447f05c..f2724c41 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1636,7 +1636,7 @@ def : QCISELECTCCIPat<SETNE, QC_SELECTNEI>;
}
let Predicates = [HasVendorXqcilsm, IsRV32] in {
-def : Pat<(qc_setwmi GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7),
+def : Pat<(qc_setwmi (i32 GPR:$rs3), GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7),
(QC_SETWMI GPR:$rs3, GPR:$rs1, tuimm5nonzero:$uimm5, tuimm7_lsb00:$uimm7)>;
} // Predicates = [HasVendorXqcilsm, IsRV32]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index e519b72..57fbaa0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -634,56 +634,56 @@ def : PatGpr<bswap, REV8_RV64, i64>;
let Predicates = [HasStdExtZbkb] in {
def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF),
- (zexti8 (XLenVT GPR:$rs1))),
- (PACKH GPR:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 8)),
- (zexti8 (XLenVT GPR:$rs1))),
- (PACKH GPR:$rs1, GPR:$rs2)>;
+ zexti8:$rs1),
+ (PACKH zexti8:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl zexti8:$rs2, (XLenVT 8)),
+ zexti8:$rs1),
+ (PACKH zexti8:$rs1, zexti8:$rs2)>;
def : Pat<(and (or (shl GPR:$rs2, (XLenVT 8)),
- (zexti8 (XLenVT GPR:$rs1))), 0xFFFF),
- (PACKH GPR:$rs1, GPR:$rs2)>;
+ zexti8:$rs1), 0xFFFF),
+ (PACKH zexti8:$rs1, GPR:$rs2)>;
def : Pat<(binop_allhusers<or> (shl GPR:$rs2, (XLenVT 8)),
- (zexti8 (XLenVT GPR:$rs1))),
- (PACKH GPR:$rs1, GPR:$rs2)>;
+ zexti8:$rs1),
+ (PACKH zexti8:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZbkb]
let Predicates = [HasStdExtZbkb, IsRV32] in {
-def : Pat<(i32 (or (zexti16 (i32 GPR:$rs1)), (shl GPR:$rs2, (i32 16)))),
- (PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(i32 (or zexti16:$rs1, (shl GPR:$rs2, (i32 16)))),
+ (PACK zexti16:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl GPR:$rs2, (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
- (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+def : Pat<(i32 (or (shl GPR:$rs2, (XLenVT 24)),
+ (shl zexti8:$rs1, (XLenVT 16)))),
+ (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>;
// Match a pattern of 2 bytes being inserted into bits [31:16], with bits
// bits [15:0] coming from a zero extended value. We can use pack with packh for
// bits [31:16]. If bits [15:0] can also be a packh, it can be matched
// separately.
-def : Pat<(or (or (shl GPR:$op1rs2, (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
- (zexti16 (XLenVT GPR:$rs1))),
- (PACK (XLenVT GPR:$rs1),
- (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+def : Pat<(i32 (or (or (shl GPR:$op1rs2, (XLenVT 24)),
+ (shl zexti8:$op1rs1, (XLenVT 16))),
+ zexti16:$rs1)),
+ (PACK zexti16:$rs1,
+ (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
}
let Predicates = [HasStdExtZbkb, IsRV64] in {
-def : Pat<(i64 (or (zexti32 (i64 GPR:$rs1)), (shl GPR:$rs2, (i64 32)))),
- (PACK GPR:$rs1, GPR:$rs2)>;
+def : Pat<(i64 (or zexti32:$rs1, (shl GPR:$rs2, (i64 32)))),
+ (PACK zexti32:$rs1, GPR:$rs2)>;
-def : Pat<(or (shl (zexti8 (XLenVT GPR:$rs2)), (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
- (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+def : Pat<(i64 (or (shl zexti8:$rs2, (XLenVT 24)),
+ (shl zexti8:$rs1, (XLenVT 16)))),
+ (SLLI (XLenVT (PACKH zexti8:$rs1, zexti8:$rs2)), (XLenVT 16))>;
def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$rs1)), (XLenVT 16))),
- (SLLI (XLenVT (PACKH GPR:$rs1, GPR:$rs2)), (XLenVT 16))>;
+ (shl zexti8:$rs1, (XLenVT 16))),
+ (SLLI (XLenVT (PACKH zexti8:$rs1, GPR:$rs2)), (XLenVT 16))>;
def : Pat<(binop_allwusers<or> (shl GPR:$rs2, (i64 16)),
- (zexti16 (i64 GPR:$rs1))),
- (PACKW GPR:$rs1, GPR:$rs2)>;
+ zexti16:$rs1),
+ (PACKW zexti16:$rs1, GPR:$rs2)>;
def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
- (zexti16 (i64 GPR:$rs1)))),
- (PACKW GPR:$rs1, GPR:$rs2)>;
+ zexti16:$rs1)),
+ (PACKW zexti16:$rs1, GPR:$rs2)>;
// Match a pattern of 2 bytes being inserted into bits [31:16], with bits
// bits [15:0] coming from a zero extended value, and bits [63:32] being
@@ -691,35 +691,35 @@ def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
// also be a packh, it can be matched separately.
def : Pat<(binop_allwusers<or>
(or (shl GPR:$op1rs2, (XLenVT 24)),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
- (zexti16 (XLenVT GPR:$rs1))),
- (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+ (shl zexti8:$op1rs1, (XLenVT 16))),
+ zexti16:$rs1),
+ (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
// We need to manually reassociate the patterns because of the binop_allwusers.
def : Pat<(binop_allwusers<or>
- (or (zexti16 (XLenVT GPR:$rs1)),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+ (or zexti16:$rs1,
+ (shl zexti8:$op1rs1, (XLenVT 16))),
(shl GPR:$op1rs2, (XLenVT 24))),
- (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+ (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
def : Pat<(binop_allwusers<or>
- (or (zexti16 (XLenVT GPR:$rs1)),
+ (or zexti16:$rs1,
(shl GPR:$op1rs2, (XLenVT 24))),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
- (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+ (shl zexti8:$op1rs1, (XLenVT 16))),
+ (PACKW zexti16:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
def : Pat<(i64 (or (or (zexti16 (XLenVT GPR:$rs1)),
- (shl (zexti8 (XLenVT GPR:$op1rs1)), (XLenVT 16))),
+ (shl zexti8:$op1rs1, (XLenVT 16))),
(sext_inreg (shl GPR:$op1rs2, (XLenVT 24)), i32))),
- (PACKW GPR:$rs1, (XLenVT (PACKH GPR:$op1rs1, GPR:$op1rs2)))>;
+ (PACKW GPR:$rs1, (XLenVT (PACKH zexti8:$op1rs1, GPR:$op1rs2)))>;
// Match a pattern of 2 halfwords being inserted into bits [63:32], with bits
// bits [31:0] coming from a zero extended value. We can use pack with packw for
// bits [63:32]. If bits [63:31] can also be a packw, it can be matched
// separately.
def : Pat<(or (or (shl GPR:$op1rs2, (i64 48)),
- (shl (zexti16 (i64 GPR:$op1rs1)), (i64 32))),
- (zexti32 (i64 GPR:$rs1))),
- (PACK (XLenVT GPR:$rs1),
- (XLenVT (PACKW GPR:$op1rs1, GPR:$op1rs2)))>;
+ (shl zexti16:$op1rs1, (i64 32))),
+ zexti32:$rs1),
+ (PACK zexti32:$rs1,
+ (XLenVT (PACKW zexti16:$op1rs1, GPR:$op1rs2)))>;
} // Predicates = [HasStdExtZbkb, IsRV64]
let Predicates = [HasStdExtZbb, IsRV32] in
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 6d86aff..3658817 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -14,6 +14,10 @@
// otherwise.
def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>;
+// This scheduling predicate is true when subtarget feature TuneHasSingleElementVecFP64
+// is enabled.
+def SingleElementVecFP64SchedPred : FeatureSchedPredicate<TuneHasSingleElementVecFP64>;
+
// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
def isSEXT_W
: TIIPredicate<"isSEXT_W",
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 17a7948..e86431f 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -338,7 +338,8 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390",
FeatureStdExtZvl1024b,
FeatureVendorXSiFivecdiscarddlone,
FeatureVendorXSiFivecflushdlone],
- SiFiveIntelligenceTuneFeatures>;
+ !listconcat(SiFiveIntelligenceTuneFeatures,
+ [TuneHasSingleElementVecFP64])>;
defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll,
TuneConditionalCompressedMoveFusion,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 3e07eff..f863392a 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -317,7 +317,6 @@ multiclass SiFive7WriteResBase<int VLEN,
ProcResourceKind VL, ProcResourceKind VS,
ProcResourceKind VCQ,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled = false,
bit hasFastGather = false> {
// Branching
@@ -832,29 +831,56 @@ multiclass SiFive7WriteResBase<int VLEN,
// 13. Vector Floating-Point Instructions
foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
- defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8);
- defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
- let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
- }
- defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4);
- let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA], mx, sew, IsWorstCase>;
- // min max require merge
- defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
+ if !eq(sew, 64) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ foreach SchedWriteName = ["WriteVFALUV", "WriteVFALUF", "WriteVFMulV", "WriteVFMulF",
+ "WriteVFMulAddV", "WriteVFMulAddF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1OrVA2], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFRecpV", "WriteVFCvtIToFV"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFSgnjV", "WriteVFSgnjF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1OrVA2], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFMinMaxV", "WriteVFMinMaxF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
+ let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ // min max require merge
+ defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
}
@@ -892,19 +918,28 @@ multiclass SiFive7WriteResBase<int VLEN,
// Widening
foreach mx = SchedMxListW in {
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
- defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtIToFV", SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in
+ defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
- defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+ defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+ let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in {
defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
@@ -912,11 +947,19 @@ multiclass SiFive7WriteResBase<int VLEN,
defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
}
- defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtFToFV", SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in
+ defm : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c;
@@ -933,13 +976,23 @@ multiclass SiFive7WriteResBase<int VLEN,
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesNarrowing<mx>.c);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defvar DefaultCycles = SiFive7GetCyclesNarrowing<mx>.c;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ foreach SchedWriteName = ["WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
}
@@ -1499,7 +1552,6 @@ multiclass SiFive7ReadAdvance {
/// eventually be supplied by different SchedMachineModels.
multiclass SiFive7SchedResources<int vlen, bit extraVALU,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled,
bit hasFastGather> {
defm SiFive7 : SiFive7ProcResources<extraVALU>;
@@ -1527,8 +1579,7 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU,
: SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
SiFive7IDiv, SiFive7FDiv, SiFive7VA1,
SiFive7VA1OrVA2, SiFive7VL, SiFive7VS,
- SiFive7VCQ, fpLatencies, isFP64Throttled,
- hasFastGather>;
+ SiFive7VCQ, fpLatencies, hasFastGather>;
//===----------------------------------------------------------------------===//
// Bypass and advance
@@ -1560,7 +1611,6 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
bit HasExtraVALU = false;
SiFive7FPLatencies FPLatencies;
- bit IsFP64Throttled = false;
bit HasFastGather = false;
string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
@@ -1587,7 +1637,6 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> {
def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
let HasExtraVALU = true;
let FPLatencies = SiFive7LowFPLatencies;
- let IsFP64Throttled = true;
let HasFastGather = true;
}
@@ -1596,7 +1645,6 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
let SchedModel = model in
defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
model.FPLatencies,
- model.IsFP64Throttled,
model.HasFastGather>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 01a4308..d11b446 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -128,6 +128,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
IsWorstCase>;
}
+multiclass LMULSEWWriteResMXSEWVariant<string name, SchedPredicateBase Pred,
+ list<ProcResourceKind> predResources,
+ int predLat, list<int> predAcquireCycles,
+ list<int> predReleaseCycles,
+ list<ProcResourceKind> noPredResources,
+ int noPredLat, list<int> noPredAcquireCycles,
+ list<int> noPredReleaseCycles,
+ string mx, int sew, bit IsWorstCase> {
+ defm "" : LMULWriteResVariantImpl<name, name # "_" # mx # "_E" # sew, Pred, predResources,
+ predLat, predAcquireCycles,
+ predReleaseCycles, noPredResources,
+ noPredLat, noPredAcquireCycles,
+ noPredReleaseCycles,
+ IsWorstCase>;
+}
+
// Define multiclasses to define SchedWrite, SchedRead, WriteRes, and
// ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the
// SchedMxList variants above. Each multiclass is responsible for defining
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 764ff998..4b3ddbd 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -592,10 +592,10 @@ def : Pat<(X86mcvttp2sis (v2f64 (X86VBroadcastld64 addr:$src)),
(VCVTTPD2DQSZ128rmbkz VK2WM:$mask, addr:$src)>;
// Patterns VCVTTPD2UDQSZ128
-def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))),
- (VCVTTPD2UDQSZ128rmb addr:$src)>;
def : Pat<(v4i32 (X86cvttp2uis (v2f64 VR128X:$src))),
(VCVTTPD2UDQSZ128rr VR128X:$src)>;
+def : Pat<(v4i32 (X86cvttp2uis (loadv2f64 addr:$src))),
+ (VCVTTPD2UDQSZ128rm addr:$src)>;
def : Pat<(v4i32 (X86cvttp2uis (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTTPD2UDQSZ128rmb addr:$src)>;
def : Pat<(X86mcvttp2uis (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 9b9e2ba..9150b58 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -459,7 +459,7 @@ void TruncInstCombine::ReduceExpressionGraph(Type *SclTy) {
Value *Op0 = I->getOperand(0);
Value *LHS = getReducedOperand(I->getOperand(1), SclTy);
Value *RHS = getReducedOperand(I->getOperand(2), SclTy);
- Res = Builder.CreateSelect(Op0, LHS, RHS);
+ Res = Builder.CreateSelect(Op0, LHS, RHS, "", I);
break;
}
case Instruction::PHI: {
diff --git a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
index 9115946..f166fef 100644
--- a/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroAnnotationElide.cpp
@@ -24,6 +24,9 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -33,6 +36,11 @@ using namespace llvm;
#define DEBUG_TYPE "coro-annotation-elide"
+static cl::opt<float> CoroElideBranchRatio(
+ "coro-elide-branch-ratio", cl::init(0.55), cl::Hidden,
+    cl::desc("Minimum BranchProbability required to consider eliding a coroutine."));
+extern cl::opt<unsigned> MinBlockCounterExecution;
+
static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) {
for (Instruction &I : F->getEntryBlock())
if (!isa<AllocaInst>(&I))
@@ -145,6 +153,30 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
bool IsCallerPresplitCoroutine = Caller->isPresplitCoroutine();
bool HasAttr = CB->hasFnAttr(llvm::Attribute::CoroElideSafe);
if (IsCallerPresplitCoroutine && HasAttr) {
+ BranchProbability MinBranchProbability(
+ static_cast<int>(CoroElideBranchRatio * MinBlockCounterExecution),
+ MinBlockCounterExecution);
+
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
+
+ auto Prob = BranchProbability::getBranchProbability(
+ BFI.getBlockFreq(CB->getParent()).getFrequency(),
+ BFI.getEntryFreq().getFrequency());
+
+ if (Prob < MinBranchProbability) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "CoroAnnotationElideUnlikely", Caller)
+ << "'" << ore::NV("callee", Callee->getName())
+ << "' not elided in '"
+ << ore::NV("caller", Caller->getName())
+ << "' because of low probability: "
+ << ore::NV("probability", Prob) << " (threshold: "
+ << ore::NV("threshold", MinBranchProbability) << ")";
+ });
+ continue;
+ }
+
auto *CallerN = CG.lookup(*Caller);
auto *CallerC = CallerN ? CG.lookupSCC(*CallerN) : nullptr;
// If CallerC is nullptr, it means LazyCallGraph hasn't visited Caller
@@ -156,7 +188,7 @@ PreservedAnalyses CoroAnnotationElidePass::run(LazyCallGraph::SCC &C,
return OptimizationRemark(DEBUG_TYPE, "CoroAnnotationElide", Caller)
<< "'" << ore::NV("callee", Callee->getName())
<< "' elided in '" << ore::NV("caller", Caller->getName())
- << "'";
+ << "' (probability: " << ore::NV("probability", Prob) << ")";
});
FAM.invalidate(*Caller, PreservedAnalyses::none());
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 2583249..1a00d17 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -109,7 +109,7 @@ static cl::opt<float> MinRegionSizeRatio(
"outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
-static cl::opt<unsigned>
+cl::opt<unsigned>
MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
cl::desc("Minimum block executions to consider "
"its BranchProbabilityInfo valid"));
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 155fcc5..9ac3be1 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5959,7 +5959,11 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
unsigned PreviousEdges = OtherCases->size();
if (OtherDest == SI->getDefaultDest())
++PreviousEdges;
- for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ unsigned E = PreviousEdges - 1;
+ // Remove all incoming values from OtherDest if OtherDest is unreachable.
+ if (NewBI->isUnconditional())
+ ++E;
+ for (unsigned I = 0; I != E; ++I)
cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3f16b03..e62d57e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5696,7 +5696,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
Instruction *I = Worklist.pop_back_val();
for (auto &Op : I->operands())
if (auto *InstOp = dyn_cast<Instruction>(Op))
- if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+ if (TheLoop->contains(InstOp) && !isa<PHINode>(InstOp) &&
AddrDefs.insert(InstOp).second)
Worklist.push_back(InstOp);
}
diff --git a/llvm/test/Analysis/IR2Vec/unreachable.ll b/llvm/test/Analysis/IR2Vec/unreachable.ll
index 9be0ee1..627e2c9 100644
--- a/llvm/test/Analysis/IR2Vec/unreachable.ll
+++ b/llvm/test/Analysis/IR2Vec/unreachable.ll
@@ -30,13 +30,17 @@ return: ; preds = %if.else, %if.then
%4 = load i32, ptr %retval, align 4
ret i32 %4
}
-
-; CHECK: Basic block vectors:
+; We'll get individual basic block embeddings for all blocks in the function.
+; But unreachable blocks are not counted for computing the function embedding.
+; CHECK: Function vector: [ 1301.20 1318.20 1335.20 ]
+; CHECK-NEXT: Basic block vectors:
; CHECK-NEXT: Basic block: entry:
; CHECK-NEXT: [ 816.20 825.20 834.20 ]
; CHECK-NEXT: Basic block: if.then:
; CHECK-NEXT: [ 195.00 198.00 201.00 ]
; CHECK-NEXT: Basic block: if.else:
; CHECK-NEXT: [ 195.00 198.00 201.00 ]
+; CHECK-NEXT: Basic block: unreachable:
+; CHECK-NEXT: [ 101.00 103.00 105.00 ]
; CHECK-NEXT: Basic block: return:
; CHECK-NEXT: [ 95.00 97.00 99.00 ]
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 4287507..dfff35d 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -1451,3 +1451,52 @@ define <4 x i32> @partial_reduce_shl_zext_non_const_rhs(<16 x i8> %l, <4 x i32>
%red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift)
ret <4 x i32> %red
}
+
+define <2 x i32> @udot_v16i8tov2i32(<2 x i32> %acc, <16 x i8> %input) {
+; CHECK-NODOT-LABEL: udot_v16i8tov2i32:
+; CHECK-NODOT: // %bb.0: // %entry
+; CHECK-NODOT-NEXT: ushll v2.8h, v1.8b, #0
+; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT: ushll v3.4s, v2.4h, #0
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT: ushll2 v4.4s, v2.8h, #0
+; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v3.16b, v4.16b, v4.16b, #8
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT: ret
+;
+; CHECK-DOT-LABEL: udot_v16i8tov2i32:
+; CHECK-DOT: // %bb.0: // %entry
+; CHECK-DOT-NEXT: movi v2.16b, #1
+; CHECK-DOT-NEXT: fmov d0, d0
+; CHECK-DOT-NEXT: udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT: addp v0.4s, v0.4s, v0.4s
+; CHECK-DOT-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-DOT-NEXT: ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_v16i8tov2i32:
+; CHECK-DOT-I8MM: // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT: movi v2.16b, #1
+; CHECK-DOT-I8MM-NEXT: fmov d0, d0
+; CHECK-DOT-I8MM-NEXT: udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT: addp v0.4s, v0.4s, v0.4s
+; CHECK-DOT-I8MM-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-DOT-I8MM-NEXT: ret
+entry:
+ %input.wide = zext <16 x i8> %input to <16 x i32>
+ %partial.reduce = tail call <2 x i32> @llvm.vector.partial.reduce.add(<2 x i32> %acc, <16 x i32> %input.wide)
+ ret <2 x i32> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
new file mode 100644
index 0000000..153ca10
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i32x2-instructions.ll
@@ -0,0 +1,1625 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-NOI32X2 %s
+; RUN: %if ptxas-sm_80 %{ \
+; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_80 \
+; RUN: %}
+; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-I32X2 %s
+; RUN: %if ptxas-sm_100 %{ \
+; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \
+; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+
+target triple = "nvptx64-nvidia-cuda"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+define <2 x i32> @test_ret_const() #0 {
+; CHECK-LABEL: test_ret_const(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {-1, 2};
+; CHECK-NEXT: ret;
+ ret <2 x i32> <i32 -1, i32 2>
+}
+
+define i32 @test_extract_0(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_extract_0(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0];
+; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_extract_0(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<2>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, _}, %rd1;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-I32X2-NEXT: ret;
+ %e = extractelement <2 x i32> %a, i32 0
+ ret i32 %e
+}
+
+define i32 @test_extract_1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_extract_1(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0];
+; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_extract_1(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<2>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {_, %r1}, %rd1;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-I32X2-NEXT: ret;
+ %e = extractelement <2 x i32> %a, i32 1
+ ret i32 %e
+}
+
+define i32 @test_extract_i(<2 x i32> %a, i64 %idx) #0 {
+; CHECK-NOI32X2-LABEL: test_extract_i(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .local .align 8 .b8 __local_depot3[8];
+; CHECK-NOI32X2-NEXT: .reg .b64 %SP;
+; CHECK-NOI32X2-NEXT: .reg .b64 %SPL;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOI32X2-NEXT: .reg .b64 %rd<6>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: mov.b64 %SPL, __local_depot3;
+; CHECK-NOI32X2-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_i_param_0];
+; CHECK-NOI32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
+; CHECK-NOI32X2-NEXT: st.v2.b32 [%SP], {%r1, %r2};
+; CHECK-NOI32X2-NEXT: and.b64 %rd2, %rd1, 1;
+; CHECK-NOI32X2-NEXT: shl.b64 %rd3, %rd2, 2;
+; CHECK-NOI32X2-NEXT: add.u64 %rd4, %SP, 0;
+; CHECK-NOI32X2-NEXT: or.b64 %rd5, %rd4, %rd3;
+; CHECK-NOI32X2-NEXT: ld.b32 %r3, [%rd5];
+; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_extract_i(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .pred %p<2>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_extract_i_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_extract_i_param_0];
+; CHECK-I32X2-NEXT: setp.eq.b64 %p1, %rd2, 0;
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: selp.b32 %r3, %r1, %r2, %p1;
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-I32X2-NEXT: ret;
+ %e = extractelement <2 x i32> %a, i64 %idx
+ ret i32 %e
+}
+
+define <2 x i32> @test_add(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_add(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_add_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_add_param_0];
+; CHECK-NOI32X2-NEXT: add.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT: add.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_add(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_add_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_add_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT: add.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT: add.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT: ret;
+ %r = add <2 x i32> %a, %b
+ ret <2 x i32> %r
+}
+
+; v2i32 add with a constant vector on the LHS: folds to per-lane add with
+; immediates 1 and 2 in both register configurations.
+define <2 x i32> @test_add_imm_0(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_add_imm_0(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_add_imm_0_param_0];
+; CHECK-NOI32X2-NEXT:    add.s32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT:    add.s32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_add_imm_0(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_add_imm_0_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT:    add.s32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT:    add.s32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT:    ret;
+  %r = add <2 x i32> <i32 1, i32 2>, %a
+  ret <2 x i32> %r
+}
+
+; Same as test_add_imm_0 but with the constant vector on the RHS; expected
+; PTX is identical (add is commutative).
+define <2 x i32> @test_add_imm_1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_add_imm_1(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_add_imm_1_param_0];
+; CHECK-NOI32X2-NEXT:    add.s32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT:    add.s32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_add_imm_1(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_add_imm_1_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT:    add.s32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT:    add.s32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT:    ret;
+  %r = add <2 x i32> %a, <i32 1, i32 2>
+  ret <2 x i32> %r
+}
+
+; v2i32 sub: two scalar sub.s32 per lane; i32x2 path unpacks b64 params first.
+define <2 x i32> @test_sub(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_sub(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_sub_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_sub_param_0];
+; CHECK-NOI32X2-NEXT:    sub.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    sub.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_sub(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_sub_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_sub_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    sub.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    sub.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %r = sub <2 x i32> %a, %b
+  ret <2 x i32> %r
+}
+
+; icmp sgt + select pattern is recognized as signed max and emitted as
+; per-lane max.s32 in both configurations.
+define <2 x i32> @test_smax(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_smax(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_smax_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_smax_param_0];
+; CHECK-NOI32X2-NEXT:    max.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    max.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_smax(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_smax_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_smax_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    max.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    max.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %cmp = icmp sgt <2 x i32> %a, %b
+  %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %r
+}
+
+; icmp ugt + select pattern is recognized as unsigned max -> per-lane max.u32.
+define <2 x i32> @test_umax(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_umax(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_umax_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_umax_param_0];
+; CHECK-NOI32X2-NEXT:    max.u32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    max.u32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_umax(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_umax_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_umax_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    max.u32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    max.u32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %cmp = icmp ugt <2 x i32> %a, %b
+  %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %r
+}
+
+; icmp sle + select pattern is recognized as signed min -> per-lane min.s32.
+define <2 x i32> @test_smin(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_smin(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_smin_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_smin_param_0];
+; CHECK-NOI32X2-NEXT:    min.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    min.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_smin(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_smin_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_smin_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    min.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    min.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %cmp = icmp sle <2 x i32> %a, %b
+  %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %r
+}
+
+; icmp ule + select pattern is recognized as unsigned min -> per-lane min.u32.
+define <2 x i32> @test_umin(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_umin(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_umin_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_umin_param_0];
+; CHECK-NOI32X2-NEXT:    min.u32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    min.u32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_umin(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_umin_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_umin_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    min.u32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    min.u32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %cmp = icmp ule <2 x i32> %a, %b
+  %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
+  ret <2 x i32> %r
+}
+
+; Vector icmp eq feeding a select with a third operand: lowered to per-lane
+; setp.eq.b32 predicates plus selp.b32 selects between %a and %c lanes.
+define <2 x i32> @test_eq(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+; CHECK-NOI32X2-LABEL: test_eq(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .pred %p<3>;
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<9>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_eq_param_2];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_eq_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_eq_param_0];
+; CHECK-NOI32X2-NEXT:    setp.eq.b32 %p1, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    setp.eq.b32 %p2, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    selp.b32 %r7, %r2, %r6, %p2;
+; CHECK-NOI32X2-NEXT:    selp.b32 %r8, %r1, %r5, %p1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_eq(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .pred %p<3>;
+; CHECK-I32X2-NEXT:    .reg .b32 %r<9>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd3, [test_eq_param_2];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_eq_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_eq_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    setp.eq.b32 %p1, %r3, %r1;
+; CHECK-I32X2-NEXT:    setp.eq.b32 %p2, %r4, %r2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r5, %r6}, %rd3;
+; CHECK-I32X2-NEXT:    selp.b32 %r7, %r4, %r6, %p2;
+; CHECK-I32X2-NEXT:    selp.b32 %r8, %r3, %r5, %p1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-I32X2-NEXT:    ret;
+  %cmp = icmp eq <2 x i32> %a, %b
+  %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %c
+  ret <2 x i32> %r
+}
+
+; Same shape as test_eq but with icmp ne -> per-lane setp.ne.b32 + selp.b32.
+define <2 x i32> @test_ne(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+; CHECK-NOI32X2-LABEL: test_ne(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .pred %p<3>;
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<9>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_ne_param_2];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_ne_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_ne_param_0];
+; CHECK-NOI32X2-NEXT:    setp.ne.b32 %p1, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    setp.ne.b32 %p2, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    selp.b32 %r7, %r2, %r6, %p2;
+; CHECK-NOI32X2-NEXT:    selp.b32 %r8, %r1, %r5, %p1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_ne(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .pred %p<3>;
+; CHECK-I32X2-NEXT:    .reg .b32 %r<9>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd3, [test_ne_param_2];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_ne_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_ne_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    setp.ne.b32 %p1, %r3, %r1;
+; CHECK-I32X2-NEXT:    setp.ne.b32 %p2, %r4, %r2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r5, %r6}, %rd3;
+; CHECK-I32X2-NEXT:    selp.b32 %r7, %r4, %r6, %p2;
+; CHECK-I32X2-NEXT:    selp.b32 %r8, %r3, %r5, %p1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
+; CHECK-I32X2-NEXT:    ret;
+  %cmp = icmp ne <2 x i32> %a, %b
+  %r = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %c
+  ret <2 x i32> %r
+}
+
+; v2i32 mul: lowered to two mul.lo.s32 (low 32 bits of the product) per lane.
+define <2 x i32> @test_mul(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_mul(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_mul_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_mul_param_0];
+; CHECK-NOI32X2-NEXT:    mul.lo.s32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    mul.lo.s32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_mul(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_mul_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_mul_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    mul.lo.s32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    mul.lo.s32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %r = mul <2 x i32> %a, %b
+  ret <2 x i32> %r
+}
+
+; v2i32 bitwise or: two or.b32 per lane in both configurations.
+define <2 x i32> @test_or(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_or(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_or_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_or_param_0];
+; CHECK-NOI32X2-NEXT:    or.b32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    or.b32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_or(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_or_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_or_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    or.b32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    or.b32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %r = or <2 x i32> %a, %b
+  ret <2 x i32> %r
+}
+
+; Or of two mostly-constant insertelement chains: {a,5} | {a,0} constant-folds
+; so the result is stored directly as {%r1, 5} with no or instruction.
+define <2 x i32> @test_or_computed(i32 %a) {
+; CHECK-LABEL: test_or_computed(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_or_computed_param_0];
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r1, 5};
+; CHECK-NEXT:    ret;
+  %ins.0 = insertelement <2 x i32> zeroinitializer, i32 %a, i32 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 5, i32 1
+  %r = or <2 x i32> %ins.1, %ins.0
+  ret <2 x i32> %r
+}
+
+; v2i32 or with constant vector on the LHS: per-lane or.b32 with immediates.
+define <2 x i32> @test_or_imm_0(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_or_imm_0(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_or_imm_0_param_0];
+; CHECK-NOI32X2-NEXT:    or.b32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT:    or.b32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_or_imm_0(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_or_imm_0_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT:    or.b32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT:    or.b32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT:    ret;
+  %r = or <2 x i32> <i32 1, i32 2>, %a
+  ret <2 x i32> %r
+}
+
+; Same as test_or_imm_0 with the constant on the RHS; identical expected PTX.
+define <2 x i32> @test_or_imm_1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_or_imm_1(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_or_imm_1_param_0];
+; CHECK-NOI32X2-NEXT:    or.b32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT:    or.b32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_or_imm_1(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_or_imm_1_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT:    or.b32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT:    or.b32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT:    ret;
+  %r = or <2 x i32> %a, <i32 1, i32 2>
+  ret <2 x i32> %r
+}
+
+; v2i32 bitwise xor: two xor.b32 per lane in both configurations.
+define <2 x i32> @test_xor(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_xor(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_xor_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_xor_param_0];
+; CHECK-NOI32X2-NEXT:    xor.b32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    xor.b32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_xor(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_xor_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_xor_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    xor.b32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    xor.b32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %r = xor <2 x i32> %a, %b
+  ret <2 x i32> %r
+}
+
+; Xor of {a,5} with {a,0} constant-folds to {0,5}; note %r1 is still loaded
+; but the store uses only immediates.
+define <2 x i32> @test_xor_computed(i32 %a) {
+; CHECK-LABEL: test_xor_computed(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_xor_computed_param_0];
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {0, 5};
+; CHECK-NEXT:    ret;
+  %ins.0 = insertelement <2 x i32> zeroinitializer, i32 %a, i32 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 5, i32 1
+  %r = xor <2 x i32> %ins.1, %ins.0
+  ret <2 x i32> %r
+}
+
+; v2i32 xor with constant vector on the LHS: per-lane xor.b32 with immediates.
+define <2 x i32> @test_xor_imm_0(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_xor_imm_0(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_xor_imm_0_param_0];
+; CHECK-NOI32X2-NEXT:    xor.b32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT:    xor.b32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_xor_imm_0(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_xor_imm_0_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT:    xor.b32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT:    xor.b32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT:    ret;
+  %r = xor <2 x i32> <i32 1, i32 2>, %a
+  ret <2 x i32> %r
+}
+
+; Same as test_xor_imm_0 with the constant on the RHS; identical expected PTX.
+define <2 x i32> @test_xor_imm_1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_xor_imm_1(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_xor_imm_1_param_0];
+; CHECK-NOI32X2-NEXT:    xor.b32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT:    xor.b32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_xor_imm_1(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_xor_imm_1_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT:    xor.b32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT:    xor.b32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT:    ret;
+  %r = xor <2 x i32> %a, <i32 1, i32 2>
+  ret <2 x i32> %r
+}
+
+; v2i32 bitwise and: two and.b32 per lane in both configurations.
+define <2 x i32> @test_and(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_and(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_and_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_and_param_0];
+; CHECK-NOI32X2-NEXT:    and.b32 %r5, %r2, %r4;
+; CHECK-NOI32X2-NEXT:    and.b32 %r6, %r1, %r3;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_and(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_and_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_and_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-I32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-I32X2-NEXT:    and.b32 %r5, %r4, %r2;
+; CHECK-I32X2-NEXT:    and.b32 %r6, %r3, %r1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-I32X2-NEXT:    ret;
+  %r = and <2 x i32> %a, %b
+  ret <2 x i32> %r
+}
+
+; And of {a,5} with {a,0} constant-folds to {a,0}: stored as {%r1, 0} with no
+; and instruction emitted.
+define <2 x i32> @test_and_computed(i32 %a) {
+; CHECK-LABEL: test_and_computed(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_and_computed_param_0];
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r1, 0};
+; CHECK-NEXT:    ret;
+  %ins.0 = insertelement <2 x i32> zeroinitializer, i32 %a, i32 0
+  %ins.1 = insertelement <2 x i32> %ins.0, i32 5, i32 1
+  %r = and <2 x i32> %ins.1, %ins.0
+  ret <2 x i32> %r
+}
+
+; v2i32 and with constant vector on the LHS: per-lane and.b32 with immediates.
+define <2 x i32> @test_and_imm_0(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_and_imm_0(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_and_imm_0_param_0];
+; CHECK-NOI32X2-NEXT:    and.b32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT:    and.b32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_and_imm_0(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_and_imm_0_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT:    and.b32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT:    and.b32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT:    ret;
+  %r = and <2 x i32> <i32 1, i32 2>, %a
+  ret <2 x i32> %r
+}
+
+; Same as test_and_imm_0 with the constant on the RHS; identical expected PTX.
+define <2 x i32> @test_and_imm_1(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_and_imm_1(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_and_imm_1_param_0];
+; CHECK-NOI32X2-NEXT:    and.b32 %r3, %r2, 2;
+; CHECK-NOI32X2-NEXT:    and.b32 %r4, %r1, 1;
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_and_imm_1(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_and_imm_1_param_0];
+; CHECK-I32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT:    and.b32 %r3, %r2, 2;
+; CHECK-I32X2-NEXT:    and.b32 %r4, %r1, 1;
+; CHECK-I32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT:    ret;
+  %r = and <2 x i32> %a, <i32 1, i32 2>
+  ret <2 x i32> %r
+}
+
+; Aligned v2i32 load/store roundtrip: ld.v2.b32/st.v2.b32 without i32x2; a
+; single b64 load/store when i32x2 is available.
+define void @test_ldst_v2i32(ptr %a, ptr %b) {
+; CHECK-NOI32X2-LABEL: test_ldst_v2i32(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOI32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.b64 %rd2, [test_ldst_v2i32_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.b64 %rd1, [test_ldst_v2i32_param_0];
+; CHECK-NOI32X2-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NOI32X2-NEXT:    st.v2.b32 [%rd2], {%r1, %r2};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_ldst_v2i32(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_ldst_v2i32_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_ldst_v2i32_param_0];
+; CHECK-I32X2-NEXT:    ld.b64 %rd3, [%rd1];
+; CHECK-I32X2-NEXT:    st.b64 [%rd2], %rd3;
+; CHECK-I32X2-NEXT:    ret;
+  %t1 = load <2 x i32>, ptr %a
+  store <2 x i32> %t1, ptr %b, align 16
+  ret void
+}
+
+; v3i32 load/store is split into a b64 piece plus a trailing b32 piece; the
+; same PTX is expected for both configurations (single CHECK prefix).
+define void @test_ldst_v3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: test_ldst_v3i32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_ldst_v3i32_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldst_v3i32_param_0];
+; CHECK-NEXT:    ld.b64 %rd3, [%rd1];
+; CHECK-NEXT:    ld.b32 %r1, [%rd1+8];
+; CHECK-NEXT:    st.b32 [%rd2+8], %r1;
+; CHECK-NEXT:    st.b64 [%rd2], %rd3;
+; CHECK-NEXT:    ret;
+  %t1 = load <3 x i32>, ptr %a
+  store <3 x i32> %t1, ptr %b, align 16
+  ret void
+}
+
+; v4i32 load/store roundtrip: ld.v4.b32/st.v4.b32 without i32x2; a pair of
+; b64 registers via ld.v2.b64/st.v2.b64 when i32x2 is available.
+define void @test_ldst_v4i32(ptr %a, ptr %b) {
+; CHECK-NOI32X2-LABEL: test_ldst_v4i32(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOI32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.b64 %rd2, [test_ldst_v4i32_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.b64 %rd1, [test_ldst_v4i32_param_0];
+; CHECK-NOI32X2-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NOI32X2-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_ldst_v4i32(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<5>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_ldst_v4i32_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_ldst_v4i32_param_0];
+; CHECK-I32X2-NEXT:    ld.v2.b64 {%rd3, %rd4}, [%rd1];
+; CHECK-I32X2-NEXT:    st.v2.b64 [%rd2], {%rd3, %rd4};
+; CHECK-I32X2-NEXT:    ret;
+  %t1 = load <4 x i32>, ptr %a
+  store <4 x i32> %t1, ptr %b, align 16
+  ret void
+}
+
+; Align-1 v2i32 load/store: expanded into byte loads/stores with shift/or
+; reassembly. The i32x2 path reassembles through 64-bit registers, hence the
+; larger shl/or.b64 sequence and shr.u64 extracts before the byte stores.
+define void @test_ldst_v2i32_unaligned(ptr %a, ptr %b) {
+; CHECK-NOI32X2-LABEL: test_ldst_v2i32_unaligned(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<13>;
+; CHECK-NOI32X2-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.b64 %rd2, [test_ldst_v2i32_unaligned_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.b64 %rd1, [test_ldst_v2i32_unaligned_param_0];
+; CHECK-NOI32X2-NEXT:    ld.b8 %r1, [%rd1+2];
+; CHECK-NOI32X2-NEXT:    shl.b32 %r2, %r1, 16;
+; CHECK-NOI32X2-NEXT:    ld.b8 %r3, [%rd1+3];
+; CHECK-NOI32X2-NEXT:    shl.b32 %r4, %r3, 24;
+; CHECK-NOI32X2-NEXT:    or.b32 %r5, %r4, %r2;
+; CHECK-NOI32X2-NEXT:    ld.b8 %r6, [%rd1];
+; CHECK-NOI32X2-NEXT:    ld.b8 %r7, [%rd1+1];
+; CHECK-NOI32X2-NEXT:    ld.b8 %r8, [%rd1+4];
+; CHECK-NOI32X2-NEXT:    ld.b8 %r9, [%rd1+5];
+; CHECK-NOI32X2-NEXT:    ld.b8 %r10, [%rd1+6];
+; CHECK-NOI32X2-NEXT:    ld.b8 %r11, [%rd1+7];
+; CHECK-NOI32X2-NEXT:    st.b8 [%rd2+7], %r11;
+; CHECK-NOI32X2-NEXT:    st.b8 [%rd2+6], %r10;
+; CHECK-NOI32X2-NEXT:    st.b8 [%rd2+5], %r9;
+; CHECK-NOI32X2-NEXT:    st.b8 [%rd2+4], %r8;
+; CHECK-NOI32X2-NEXT:    st.b8 [%rd2+1], %r7;
+; CHECK-NOI32X2-NEXT:    st.b8 [%rd2], %r6;
+; CHECK-NOI32X2-NEXT:    st.b8 [%rd2+3], %r3;
+; CHECK-NOI32X2-NEXT:    shr.u32 %r12, %r5, 16;
+; CHECK-NOI32X2-NEXT:    st.b8 [%rd2+2], %r12;
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_ldst_v2i32_unaligned(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<28>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_ldst_v2i32_unaligned_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_ldst_v2i32_unaligned_param_0];
+; CHECK-I32X2-NEXT:    ld.b8 %rd3, [%rd1];
+; CHECK-I32X2-NEXT:    ld.b8 %rd4, [%rd1+1];
+; CHECK-I32X2-NEXT:    shl.b64 %rd5, %rd4, 8;
+; CHECK-I32X2-NEXT:    or.b64 %rd6, %rd5, %rd3;
+; CHECK-I32X2-NEXT:    ld.b8 %rd7, [%rd1+2];
+; CHECK-I32X2-NEXT:    shl.b64 %rd8, %rd7, 16;
+; CHECK-I32X2-NEXT:    ld.b8 %rd9, [%rd1+3];
+; CHECK-I32X2-NEXT:    shl.b64 %rd10, %rd9, 24;
+; CHECK-I32X2-NEXT:    or.b64 %rd11, %rd10, %rd8;
+; CHECK-I32X2-NEXT:    or.b64 %rd12, %rd11, %rd6;
+; CHECK-I32X2-NEXT:    ld.b8 %rd13, [%rd1+4];
+; CHECK-I32X2-NEXT:    ld.b8 %rd14, [%rd1+5];
+; CHECK-I32X2-NEXT:    shl.b64 %rd15, %rd14, 8;
+; CHECK-I32X2-NEXT:    or.b64 %rd16, %rd15, %rd13;
+; CHECK-I32X2-NEXT:    ld.b8 %rd17, [%rd1+6];
+; CHECK-I32X2-NEXT:    shl.b64 %rd18, %rd17, 16;
+; CHECK-I32X2-NEXT:    ld.b8 %rd19, [%rd1+7];
+; CHECK-I32X2-NEXT:    shl.b64 %rd20, %rd19, 24;
+; CHECK-I32X2-NEXT:    or.b64 %rd21, %rd20, %rd18;
+; CHECK-I32X2-NEXT:    or.b64 %rd22, %rd21, %rd16;
+; CHECK-I32X2-NEXT:    shl.b64 %rd23, %rd22, 32;
+; CHECK-I32X2-NEXT:    or.b64 %rd24, %rd23, %rd12;
+; CHECK-I32X2-NEXT:    st.b8 [%rd2+6], %rd17;
+; CHECK-I32X2-NEXT:    shr.u64 %rd25, %rd24, 56;
+; CHECK-I32X2-NEXT:    st.b8 [%rd2+7], %rd25;
+; CHECK-I32X2-NEXT:    st.b8 [%rd2+4], %rd13;
+; CHECK-I32X2-NEXT:    shr.u64 %rd26, %rd24, 40;
+; CHECK-I32X2-NEXT:    st.b8 [%rd2+5], %rd26;
+; CHECK-I32X2-NEXT:    st.b8 [%rd2+1], %rd4;
+; CHECK-I32X2-NEXT:    st.b8 [%rd2], %rd3;
+; CHECK-I32X2-NEXT:    st.b8 [%rd2+3], %rd9;
+; CHECK-I32X2-NEXT:    shr.u64 %rd27, %rd24, 16;
+; CHECK-I32X2-NEXT:    st.b8 [%rd2+2], %rd27;
+; CHECK-I32X2-NEXT:    ret;
+  %t1 = load <2 x i32>, ptr %a, align 1
+  store <2 x i32> %t1, ptr %b, align 1
+  ret void
+}
+
+declare <2 x i32> @test_callee(<2 x i32> %a, <2 x i32> %b) #0
+
+; v2i32 argument/return passing through a call: 8-byte aligned param slots,
+; written as v2.b32 pairs without i32x2 and as single b64 stores with i32x2.
+define <2 x i32> @test_call(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_call(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_call_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_call_param_0];
+; CHECK-NOI32X2-NEXT:    { // callseq 0, 0
+; CHECK-NOI32X2-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NOI32X2-NEXT:    .param .align 8 .b8 param1[8];
+; CHECK-NOI32X2-NEXT:    .param .align 8 .b8 retval0[8];
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [param1], {%r3, %r4};
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [param0], {%r1, %r2};
+; CHECK-NOI32X2-NEXT:    call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r5, %r6}, [retval0];
+; CHECK-NOI32X2-NEXT:    } // callseq 0
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r5, %r6};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_call(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_call_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_call_param_0];
+; CHECK-I32X2-NEXT:    { // callseq 0, 0
+; CHECK-I32X2-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-I32X2-NEXT:    .param .align 8 .b8 param1[8];
+; CHECK-I32X2-NEXT:    .param .align 8 .b8 retval0[8];
+; CHECK-I32X2-NEXT:    st.param.b64 [param1], %rd2;
+; CHECK-I32X2-NEXT:    st.param.b64 [param0], %rd1;
+; CHECK-I32X2-NEXT:    call.uni (retval0), test_callee, (param0, param1);
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd3, [retval0];
+; CHECK-I32X2-NEXT:    } // callseq 0
+; CHECK-I32X2-NEXT:    st.param.b64 [func_retval0], %rd3;
+; CHECK-I32X2-NEXT:    ret;
+  %r = call <2 x i32> @test_callee(<2 x i32> %a, <2 x i32> %b)
+  ret <2 x i32> %r
+}
+
+; Same as test_call but with the call arguments swapped (%b, %a), checking the
+; operand-to-param-slot mapping follows the call site, not the param order.
+define <2 x i32> @test_call_flipped(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_call_flipped(
+; CHECK-NOI32X2:       {
+; CHECK-NOI32X2-NEXT:    .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT:  // %bb.0:
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_call_flipped_param_1];
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_call_flipped_param_0];
+; CHECK-NOI32X2-NEXT:    { // callseq 1, 0
+; CHECK-NOI32X2-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NOI32X2-NEXT:    .param .align 8 .b8 param1[8];
+; CHECK-NOI32X2-NEXT:    .param .align 8 .b8 retval0[8];
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [param1], {%r1, %r2};
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [param0], {%r3, %r4};
+; CHECK-NOI32X2-NEXT:    call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOI32X2-NEXT:    ld.param.v2.b32 {%r5, %r6}, [retval0];
+; CHECK-NOI32X2-NEXT:    } // callseq 1
+; CHECK-NOI32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r5, %r6};
+; CHECK-NOI32X2-NEXT:    ret;
+;
+; CHECK-I32X2-LABEL: test_call_flipped(
+; CHECK-I32X2:       {
+; CHECK-I32X2-NEXT:    .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT:  // %bb.0:
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd2, [test_call_flipped_param_1];
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd1, [test_call_flipped_param_0];
+; CHECK-I32X2-NEXT:    { // callseq 1, 0
+; CHECK-I32X2-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-I32X2-NEXT:    .param .align 8 .b8 param1[8];
+; CHECK-I32X2-NEXT:    .param .align 8 .b8 retval0[8];
+; CHECK-I32X2-NEXT:    st.param.b64 [param1], %rd1;
+; CHECK-I32X2-NEXT:    st.param.b64 [param0], %rd2;
+; CHECK-I32X2-NEXT:    call.uni (retval0), test_callee, (param0, param1);
+; CHECK-I32X2-NEXT:    ld.param.b64 %rd3, [retval0];
+; CHECK-I32X2-NEXT:    } // callseq 1
+; CHECK-I32X2-NEXT:    st.param.b64 [func_retval0], %rd3;
+; CHECK-I32X2-NEXT:    ret;
+  %r = call <2 x i32> @test_callee(<2 x i32> %b, <2 x i32> %a)
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @test_tailcall_flipped(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_tailcall_flipped(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_tailcall_flipped_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_tailcall_flipped_param_0];
+; CHECK-NOI32X2-NEXT: { // callseq 2, 0
+; CHECK-NOI32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NOI32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-NOI32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [param1], {%r1, %r2};
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [param0], {%r3, %r4};
+; CHECK-NOI32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [retval0];
+; CHECK-NOI32X2-NEXT: } // callseq 2
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r5, %r6};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_tailcall_flipped(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0];
+; CHECK-I32X2-NEXT: { // callseq 2, 0
+; CHECK-I32X2-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-I32X2-NEXT: .param .align 8 .b8 param1[8];
+; CHECK-I32X2-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-I32X2-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-I32X2-NEXT: st.param.b64 [param0], %rd2;
+; CHECK-I32X2-NEXT: call.uni (retval0), test_callee, (param0, param1);
+; CHECK-I32X2-NEXT: ld.param.b64 %rd3, [retval0];
+; CHECK-I32X2-NEXT: } // callseq 2
+; CHECK-I32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-I32X2-NEXT: ret;
+ %r = tail call <2 x i32> @test_callee(<2 x i32> %b, <2 x i32> %a)
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_select(<2 x i32> %a, <2 x i32> %b, i1 zeroext %c) #0 {
+; CHECK-NOI32X2-LABEL: test_select(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .pred %p<2>;
+; CHECK-NOI32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.b8 %rs1, [test_select_param_2];
+; CHECK-NOI32X2-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-NOI32X2-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_param_0];
+; CHECK-NOI32X2-NEXT: selp.b32 %r5, %r2, %r4, %p1;
+; CHECK-NOI32X2-NEXT: selp.b32 %r6, %r1, %r3, %p1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_select(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .pred %p<2>;
+; CHECK-I32X2-NEXT: .reg .b16 %rs<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b8 %rs1, [test_select_param_2];
+; CHECK-I32X2-NEXT: and.b16 %rs2, %rs1, 1;
+; CHECK-I32X2-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_select_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_select_param_0];
+; CHECK-I32X2-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1;
+; CHECK-I32X2-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-I32X2-NEXT: ret;
+ %r = select i1 %c, <2 x i32> %a, <2 x i32> %b
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_select_cc(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) #0 {
+; CHECK-NOI32X2-LABEL: test_select_cc(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<11>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_3];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_2];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0];
+; CHECK-NOI32X2-NEXT: setp.ne.b32 %p1, %r5, %r7;
+; CHECK-NOI32X2-NEXT: setp.ne.b32 %p2, %r6, %r8;
+; CHECK-NOI32X2-NEXT: selp.b32 %r9, %r2, %r4, %p2;
+; CHECK-NOI32X2-NEXT: selp.b32 %r10, %r1, %r3, %p1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_select_cc(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .pred %p<3>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<11>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<5>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd4;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd3;
+; CHECK-I32X2-NEXT: setp.ne.b32 %p1, %r3, %r1;
+; CHECK-I32X2-NEXT: setp.ne.b32 %p2, %r4, %r2;
+; CHECK-I32X2-NEXT: mov.b64 {%r5, %r6}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {%r7, %r8}, %rd1;
+; CHECK-I32X2-NEXT: selp.b32 %r9, %r8, %r6, %p2;
+; CHECK-I32X2-NEXT: selp.b32 %r10, %r7, %r5, %p1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9};
+; CHECK-I32X2-NEXT: ret;
+ %cc = icmp ne <2 x i32> %c, %d
+ %r = select <2 x i1> %cc, <2 x i32> %a, <2 x i32> %b
+ ret <2 x i32> %r
+}
+
+define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_trunc_2xi32(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_param_0];
+; CHECK-NOI32X2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
+; CHECK-NOI32X2-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_trunc_2xi32(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_param_0];
+; CHECK-I32X2-NEXT: st.param.b32 [func_retval0], %rd1;
+; CHECK-I32X2-NEXT: ret;
+ %r = trunc <2 x i32> %a to <2 x i16>
+ ret <2 x i16> %r
+}
+
+define <2 x i32> @test_trunc_2xi64(<2 x i64> %a) #0 {
+; CHECK-LABEL: test_trunc_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
+; CHECK-NEXT: cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NEXT: ret;
+ %r = trunc <2 x i64> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
+; CHECK-LABEL: test_zext_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2};
+; CHECK-NEXT: ret;
+ %r = zext <2 x i16> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i64> @test_zext_2xi64(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_zext_2xi64(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_zext_2xi64_param_0];
+; CHECK-NOI32X2-NEXT: cvt.u64.u32 %rd1, %r2;
+; CHECK-NOI32X2-NEXT: cvt.u64.u32 %rd2, %r1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_zext_2xi64(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_zext_2xi64_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: cvt.u64.u32 %rd2, %r2;
+; CHECK-I32X2-NEXT: cvt.u64.u32 %rd3, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
+; CHECK-I32X2-NEXT: ret;
+ %r = zext <2 x i32> %a to <2 x i64>
+ ret <2 x i64> %r
+}
+
+define <2 x i32> @test_bitcast_i64_to_2xi32(i64 %a) #0 {
+; CHECK-LABEL: test_bitcast_i64_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_i64_to_2xi32_param_0];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %r = bitcast i64 %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_bitcast_double_to_2xi32(double %a) #0 {
+; CHECK-LABEL: test_bitcast_double_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_double_to_2xi32_param_0];
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-NEXT: ret;
+ %r = bitcast double %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define i64 @test_bitcast_2xi32_to_i64(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_bitcast_2xi32_to_i64(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_i64_param_0];
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_bitcast_2xi32_to_i64(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_i64_param_0];
+; CHECK-I32X2-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-I32X2-NEXT: ret;
+ %r = bitcast <2 x i32> %a to i64
+ ret i64 %r
+}
+
+define double @test_bitcast_2xi32_to_double(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_bitcast_2xi32_to_double(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_double_param_0];
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_bitcast_2xi32_to_double(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_double_param_0];
+; CHECK-I32X2-NEXT: st.param.b64 [func_retval0], %rd1;
+; CHECK-I32X2-NEXT: ret;
+ %r = bitcast <2 x i32> %a to double
+ ret double %r
+}
+
+
+define <4 x half> @test_bitcast_2xi32_to_4xhalf(i32 %a) #0 {
+; CHECK-LABEL: test_bitcast_2xi32_to_4xhalf(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xi32_to_4xhalf_param_0];
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, 5};
+; CHECK-NEXT: ret;
+ %ins.0 = insertelement <2 x i32> poison, i32 %a, i32 0
+ %ins.1 = insertelement <2 x i32> %ins.0, i32 5, i32 1
+ %r = bitcast <2 x i32> %ins.1 to <4 x half>
+ ret <4 x half> %r
+}
+
+
+define <2 x i32> @test_shufflevector(<2 x i32> %a) #0 {
+; CHECK-NOI32X2-LABEL: test_shufflevector(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0];
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_shufflevector(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-I32X2-NEXT: ret;
+ %s = shufflevector <2 x i32> %a, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+ ret <2 x i32> %s
+}
+
+define <2 x i32> @test_shufflevector_2(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-NOI32X2-LABEL: test_shufflevector_2(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_shufflevector_2_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_2_param_0];
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r4};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_shufflevector_2(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<3>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_shufflevector_2_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_shufflevector_2_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {_, %r1}, %rd2;
+; CHECK-I32X2-NEXT: mov.b64 {_, %r2}, %rd1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-I32X2-NEXT: ret;
+ %s = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i32> %s
+}
+
+
+define <2 x i32> @test_insertelement(<2 x i32> %a, i32 %x) #0 {
+; CHECK-NOI32X2-LABEL: test_insertelement(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<4>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_insertelement_param_0];
+; CHECK-NOI32X2-NEXT: ld.param.b32 %r3, [test_insertelement_param_1];
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r3};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_insertelement(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<3>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b32 %r1, [test_insertelement_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_insertelement_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r2, _}, %rd1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-I32X2-NEXT: ret;
+ %i = insertelement <2 x i32> %a, i32 %x, i64 1
+ ret <2 x i32> %i
+}
+
+define <2 x i32> @test_fptosi_2xhalf_to_2xi32(<2 x half> %a) #0 {
+; CHECK-LABEL: test_fptosi_2xhalf_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi32_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: cvt.rzi.s32.f16 %r2, %rs2;
+; CHECK-NEXT: cvt.rzi.s32.f16 %r3, %rs1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2};
+; CHECK-NEXT: ret;
+ %r = fptosi <2 x half> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @test_fptoui_2xhalf_to_2xi32(<2 x half> %a) #0 {
+; CHECK-LABEL: test_fptoui_2xhalf_to_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi32_param_0];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: cvt.rzi.u32.f16 %r2, %rs2;
+; CHECK-NEXT: cvt.rzi.u32.f16 %r3, %rs1;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2};
+; CHECK-NEXT: ret;
+ %r = fptoui <2 x half> %a to <2 x i32>
+ ret <2 x i32> %r
+}
+
+define void @test_srem_v2i32(ptr %a, ptr %b, ptr %c) {
+; CHECK-LABEL: test_srem_v2i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<7>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v2i32_param_2];
+; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v2i32_param_1];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v2i32_param_0];
+; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: ld.v2.b32 {%r3, %r4}, [%rd2];
+; CHECK-NEXT: rem.s32 %r5, %r2, %r4;
+; CHECK-NEXT: rem.s32 %r6, %r1, %r3;
+; CHECK-NEXT: st.v2.b32 [%rd3], {%r6, %r5};
+; CHECK-NEXT: ret;
+entry:
+ %t57 = load <2 x i32>, ptr %a, align 8
+ %t59 = load <2 x i32>, ptr %b, align 8
+ %x = srem <2 x i32> %t57, %t59
+ store <2 x i32> %x, ptr %c, align 8
+ ret void
+}
+
+define void @test_srem_v3i32(ptr %a, ptr %b, ptr %c) {
+; CHECK-NOI32X2-LABEL: test_srem_v3i32(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<10>;
+; CHECK-NOI32X2-NEXT: .reg .b64 %rd<10>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0: // %entry
+; CHECK-NOI32X2-NEXT: ld.param.b64 %rd3, [test_srem_v3i32_param_2];
+; CHECK-NOI32X2-NEXT: ld.param.b64 %rd2, [test_srem_v3i32_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.b64 %rd1, [test_srem_v3i32_param_0];
+; CHECK-NOI32X2-NEXT: ld.b32 %r1, [%rd1+8];
+; CHECK-NOI32X2-NEXT: ld.b64 %rd4, [%rd1];
+; CHECK-NOI32X2-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r2}, %rd4; }
+; CHECK-NOI32X2-NEXT: cvt.u32.u64 %r3, %rd4;
+; CHECK-NOI32X2-NEXT: ld.b32 %r4, [%rd2+8];
+; CHECK-NOI32X2-NEXT: ld.b64 %rd5, [%rd2];
+; CHECK-NOI32X2-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r5}, %rd5; }
+; CHECK-NOI32X2-NEXT: cvt.u32.u64 %r6, %rd5;
+; CHECK-NOI32X2-NEXT: rem.s32 %r7, %r3, %r6;
+; CHECK-NOI32X2-NEXT: cvt.u64.u32 %rd6, %r7;
+; CHECK-NOI32X2-NEXT: rem.s32 %r8, %r2, %r5;
+; CHECK-NOI32X2-NEXT: cvt.u64.u32 %rd7, %r8;
+; CHECK-NOI32X2-NEXT: shl.b64 %rd8, %rd7, 32;
+; CHECK-NOI32X2-NEXT: or.b64 %rd9, %rd6, %rd8;
+; CHECK-NOI32X2-NEXT: rem.s32 %r9, %r1, %r4;
+; CHECK-NOI32X2-NEXT: st.b32 [%rd3+8], %r9;
+; CHECK-NOI32X2-NEXT: st.b64 [%rd3], %rd9;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_srem_v3i32(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<10>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0: // %entry
+; CHECK-I32X2-NEXT: ld.param.b64 %rd3, [test_srem_v3i32_param_2];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_srem_v3i32_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_srem_v3i32_param_0];
+; CHECK-I32X2-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-I32X2-NEXT: ld.b32 %r3, [%rd1+8];
+; CHECK-I32X2-NEXT: ld.v2.b32 {%r4, %r5}, [%rd2];
+; CHECK-I32X2-NEXT: ld.b32 %r6, [%rd2+8];
+; CHECK-I32X2-NEXT: rem.s32 %r7, %r3, %r6;
+; CHECK-I32X2-NEXT: rem.s32 %r8, %r2, %r5;
+; CHECK-I32X2-NEXT: rem.s32 %r9, %r1, %r4;
+; CHECK-I32X2-NEXT: st.v2.b32 [%rd3], {%r9, %r8};
+; CHECK-I32X2-NEXT: st.b32 [%rd3+8], %r7;
+; CHECK-I32X2-NEXT: ret;
+entry:
+ %t57 = load <3 x i32>, ptr %a, align 8
+ %t59 = load <3 x i32>, ptr %b, align 8
+ %x = srem <3 x i32> %t57, %t59
+ store <3 x i32> %x, ptr %c, align 8
+ ret void
+}
+
+define void @test_sext_v2i1_to_v2i32(ptr %a, ptr %b, ptr %c) {
+; CHECK-NOI32X2-LABEL: test_sext_v2i1_to_v2i32(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .pred %p<3>;
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-NOI32X2-NEXT: .reg .b64 %rd<4>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0: // %entry
+; CHECK-NOI32X2-NEXT: ld.param.b64 %rd3, [test_sext_v2i1_to_v2i32_param_2];
+; CHECK-NOI32X2-NEXT: ld.param.b64 %rd2, [test_sext_v2i1_to_v2i32_param_1];
+; CHECK-NOI32X2-NEXT: ld.param.b64 %rd1, [test_sext_v2i1_to_v2i32_param_0];
+; CHECK-NOI32X2-NEXT: ld.b32 %r1, [%rd1];
+; CHECK-NOI32X2-NEXT: ld.b32 %r2, [%rd1+4];
+; CHECK-NOI32X2-NEXT: ld.b32 %r3, [%rd2];
+; CHECK-NOI32X2-NEXT: ld.b32 %r4, [%rd2+4];
+; CHECK-NOI32X2-NEXT: setp.gt.u32 %p1, %r2, %r4;
+; CHECK-NOI32X2-NEXT: setp.gt.u32 %p2, %r1, %r3;
+; CHECK-NOI32X2-NEXT: selp.b32 %r5, -1, 0, %p2;
+; CHECK-NOI32X2-NEXT: selp.b32 %r6, -1, 0, %p1;
+; CHECK-NOI32X2-NEXT: st.b32 [%rd3+4], %r6;
+; CHECK-NOI32X2-NEXT: st.b32 [%rd3], %r5;
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_sext_v2i1_to_v2i32(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .pred %p<3>;
+; CHECK-I32X2-NEXT: .reg .b32 %r<7>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<14>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0: // %entry
+; CHECK-I32X2-NEXT: ld.param.b64 %rd3, [test_sext_v2i1_to_v2i32_param_2];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd2, [test_sext_v2i1_to_v2i32_param_1];
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_sext_v2i1_to_v2i32_param_0];
+; CHECK-I32X2-NEXT: ld.b32 %rd4, [%rd1];
+; CHECK-I32X2-NEXT: ld.b32 %rd5, [%rd1+4];
+; CHECK-I32X2-NEXT: shl.b64 %rd6, %rd5, 32;
+; CHECK-I32X2-NEXT: or.b64 %rd7, %rd6, %rd4;
+; CHECK-I32X2-NEXT: ld.b32 %rd8, [%rd2];
+; CHECK-I32X2-NEXT: ld.b32 %rd9, [%rd2+4];
+; CHECK-I32X2-NEXT: shl.b64 %rd10, %rd9, 32;
+; CHECK-I32X2-NEXT: or.b64 %rd11, %rd10, %rd8;
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd11;
+; CHECK-I32X2-NEXT: mov.b64 {%r3, %r4}, %rd7;
+; CHECK-I32X2-NEXT: setp.gt.u32 %p1, %r3, %r1;
+; CHECK-I32X2-NEXT: setp.gt.u32 %p2, %r4, %r2;
+; CHECK-I32X2-NEXT: selp.b32 %r5, -1, 0, %p2;
+; CHECK-I32X2-NEXT: selp.b32 %r6, -1, 0, %p1;
+; CHECK-I32X2-NEXT: mov.b64 %rd12, {%r6, %r5};
+; CHECK-I32X2-NEXT: st.b32 [%rd3], %rd12;
+; CHECK-I32X2-NEXT: shr.u64 %rd13, %rd12, 32;
+; CHECK-I32X2-NEXT: st.b32 [%rd3+4], %rd13;
+; CHECK-I32X2-NEXT: ret;
+entry:
+ %t1 = load <2 x i32>, ptr %a, align 4
+ %t2 = load <2 x i32>, ptr %b, align 4
+ %t5 = icmp ugt <2 x i32> %t1, %t2
+ %t6 = sext <2 x i1> %t5 to <2 x i32>
+ store <2 x i32> %t6, ptr %c, align 4
+ ret void
+}
+
+define <2 x float> @test_uitofp_v2i32(<2 x i32> %a) {
+; CHECK-NOI32X2-LABEL: test_uitofp_v2i32(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_v2i32_param_0];
+; CHECK-NOI32X2-NEXT: cvt.rn.f32.u32 %r3, %r2;
+; CHECK-NOI32X2-NEXT: cvt.rn.f32.u32 %r4, %r1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_uitofp_v2i32(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_v2i32_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: cvt.rn.f32.u32 %r3, %r2;
+; CHECK-I32X2-NEXT: cvt.rn.f32.u32 %r4, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT: ret;
+ %r = uitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+define <2 x float> @test_sitofp_v2i32(<2 x i32> %a) {
+; CHECK-NOI32X2-LABEL: test_sitofp_v2i32(
+; CHECK-NOI32X2: {
+; CHECK-NOI32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-NOI32X2-EMPTY:
+; CHECK-NOI32X2-NEXT: // %bb.0:
+; CHECK-NOI32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_v2i32_param_0];
+; CHECK-NOI32X2-NEXT: cvt.rn.f32.s32 %r3, %r2;
+; CHECK-NOI32X2-NEXT: cvt.rn.f32.s32 %r4, %r1;
+; CHECK-NOI32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NOI32X2-NEXT: ret;
+;
+; CHECK-I32X2-LABEL: test_sitofp_v2i32(
+; CHECK-I32X2: {
+; CHECK-I32X2-NEXT: .reg .b32 %r<5>;
+; CHECK-I32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-I32X2-EMPTY:
+; CHECK-I32X2-NEXT: // %bb.0:
+; CHECK-I32X2-NEXT: ld.param.b64 %rd1, [test_sitofp_v2i32_param_0];
+; CHECK-I32X2-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-I32X2-NEXT: cvt.rn.f32.s32 %r3, %r2;
+; CHECK-I32X2-NEXT: cvt.rn.f32.s32 %r4, %r1;
+; CHECK-I32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-I32X2-NEXT: ret;
+ %r = sitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %r
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 1a7a72d..693a40d 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -142,6 +142,7 @@
; CHECK-NEXT: shvstvecd - 'Shvstvecd' (vstvec supports Direct mode).
; CHECK-NEXT: shxadd-load-fusion - Enable SH(1|2|3)ADD(.UW) + load macrofusion.
; CHECK-NEXT: sifive7 - SiFive 7-Series processors.
+; CHECK-NEXT: single-element-vec-fp64 - Certain vector FP64 operations produce a single result element per cycle.
; CHECK-NEXT: smaia - 'Smaia' (Advanced Interrupt Architecture Machine Level).
; CHECK-NEXT: smcdeleg - 'Smcdeleg' (Counter Delegation Machine Level).
; CHECK-NEXT: smcntrpmf - 'Smcntrpmf' (Cycle and Instret Privilege Mode Filtering).
diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll
index d6252cc..150bef0 100644
--- a/llvm/test/CodeGen/X86/fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/fmaxnum.ll
@@ -645,11 +645,47 @@ define float @test_maxnum_const_op2(float %x) {
ret float %r
}
-define float @test_maxnum_const_nan(float %x) {
-; CHECK-LABEL: test_maxnum_const_nan:
-; CHECK: # %bb.0:
-; CHECK-NEXT: retq
- %r = call float @llvm.maxnum.f32(float %x, float 0x7fff000000000000)
+define float @test_maxnum_const_nan(float %x, float %y) {
+; SSE-LABEL: test_maxnum_const_nan:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_maxnum_const_nan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+ %r = call float @llvm.maxnum.f32(float %y, float 0x7fff000000000000)
+ ret float %r
+}
+
+; nnan maxnum(Y, -inf) -> Y
+define float @test_maxnum_neg_inf_nnan(float %x, float %y) nounwind {
+; SSE-LABEL: test_maxnum_neg_inf_nnan:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_maxnum_neg_inf_nnan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+ %r = call nnan float @llvm.maxnum.f32(float %y, float 0xfff0000000000000)
+ ret float %r
+}
+
+; Test SNaN quieting
+define float @test_maxnum_snan(float %x) {
+; SSE-LABEL: test_maxnum_snan:
+; SSE: # %bb.0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_maxnum_snan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: retq
+ %r = call float @llvm.maxnum.f32(float 0x7ff4000000000000, float %x)
ret float %r
}
diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
index 864c233..06515e4 100644
--- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll
@@ -2649,3 +2649,102 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
%r = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
ret <4 x bfloat> %r
}
+
+; nnan minimum(Y, +inf) -> Y
+define float @test_fminimum_inf_nnan(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fminimum_inf_nnan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fminimum_inf_nnan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fminimum_inf_nnan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fminimum_inf_nnan:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = call nnan float @llvm.minimum.f32(float %y, float 0x7ff0000000000000)
+ ret float %1
+}
+
+; nnan maximum(Y, -inf) -> Y
+define float @test_fmaximum_neg_inf_nnan(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximum_neg_inf_nnan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximum_neg_inf_nnan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximum_neg_inf_nnan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximum_neg_inf_nnan:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = call nnan float @llvm.maximum.f32(float %y, float 0xfff0000000000000)
+ ret float %1
+}
+
+; Test SNaN quieting
+define float @test_fmaximum_snan(float %x) {
+; SSE2-LABEL: test_fmaximum_snan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximum_snan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximum_snan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximum_snan:
+; X86: # %bb.0:
+; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT: retl
+ %1 = tail call float @llvm.maximum.f32(float 0x7ff4000000000000, float %x)
+ ret float %1
+}
+
+define float @test_fminimum_snan(float %x) {
+; SSE2-LABEL: test_fminimum_snan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fminimum_snan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fminimum_snan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fminimum_snan:
+; X86: # %bb.0:
+; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X86-NEXT: retl
+ %1 = tail call float @llvm.minimum.f32(float 0x7ff4000000000000, float %x)
+ ret float %1
+}
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index c66473e..0fe107c 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -2479,3 +2479,102 @@ define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) n
%r = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
ret <4 x bfloat> %r
}
+
+; nnan minimumnum(Y, +inf) -> Y
+define float @test_fminimumnum_inf_nnan(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_inf_nnan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fminimumnum_inf_nnan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fminimumnum_inf_nnan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fminimumnum_inf_nnan:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = call nnan float @llvm.minimumnum.f32(float %y, float 0x7ff0000000000000)
+ ret float %1
+}
+
+; nnan maximumnum(Y, -inf) -> Y
+define float @test_fmaximumnum_neg_inf_nnan(float %x, float %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_neg_inf_nnan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_neg_inf_nnan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_neg_inf_nnan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_neg_inf_nnan:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = call nnan float @llvm.maximumnum.f32(float %y, float 0xfff0000000000000)
+ ret float %1
+}
+
+; Test we propagate the non-NaN arg, even if one arg is SNaN
+define float @test_fmaximumnum_snan(float %x, float %y) {
+; SSE2-LABEL: test_fmaximumnum_snan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_snan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_snan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_snan:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = tail call float @llvm.maximumnum.f32(float 0x7ff4000000000000, float %y)
+ ret float %1
+}
+
+define float @test_fminimumnum_snan(float %x, float %y) {
+; SSE2-LABEL: test_fminimumnum_snan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fminimumnum_snan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fminimumnum_snan:
+; AVX10_2: # %bb.0:
+; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fminimumnum_snan:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: retl
+ %1 = tail call float @llvm.minimumnum.f32(float 0x7ff4000000000000, float %y)
+ ret float %1
+}
diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll
index 0ef8fde..4aa1a61 100644
--- a/llvm/test/CodeGen/X86/fminnum.ll
+++ b/llvm/test/CodeGen/X86/fminnum.ll
@@ -645,11 +645,47 @@ define float @test_minnum_const_op2(float %x) {
ret float %r
}
-define float @test_minnum_const_nan(float %x) {
-; CHECK-LABEL: test_minnum_const_nan:
-; CHECK: # %bb.0:
-; CHECK-NEXT: retq
- %r = call float @llvm.minnum.f32(float %x, float 0x7fff000000000000)
+define float @test_minnum_const_nan(float %x, float %y) {
+; SSE-LABEL: test_minnum_const_nan:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_minnum_const_nan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+ %r = call float @llvm.minnum.f32(float %y, float 0x7fff000000000000)
+ ret float %r
+}
+
+; nnan minnum(Y, +inf) -> Y
+define float @test_minnum_inf_nnan(float %x, float %y) nounwind {
+; SSE-LABEL: test_minnum_inf_nnan:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_minnum_inf_nnan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+ %r = call nnan float @llvm.minnum.f32(float %y, float 0x7ff0000000000000)
+ ret float %r
+}
+
+; Test SNaN quieting
+define float @test_minnum_snan(float %x) {
+; SSE-LABEL: test_minnum_snan:
+; SSE: # %bb.0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test_minnum_snan:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: retq
+ %r = call float @llvm.minnum.f32(float 0x7ff4000000000000, float %x)
ret float %r
}
diff --git a/llvm/test/CodeGen/X86/pgo-profile-o0.ll b/llvm/test/CodeGen/X86/pgo-profile-o0.ll
new file mode 100644
index 0000000..f9704fc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pgo-profile-o0.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=x86_64-- -O0 -pgo-kind=pgo-sample-use-pipeline -debug-pass=Structure %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=PASSES
+; RUN: llc -mtriple=x86_64-- -O0 -pgo-kind=pgo-sample-use-pipeline -debug-only=branch-prob %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=BRANCH_PROB
+; RUN: llc -mtriple=x86_64-- -O0 -pgo-kind=pgo-sample-use-pipeline -stop-after=finalize-isel %s -o - | FileCheck %s --check-prefix=MIR
+
+; REQUIRES: asserts
+
+; This test verifies that PGO profile information (branch weights) is preserved
+; during instruction selection at -O0.
+
+; Test function with explicit branch weights from PGO.
+define i32 @test_pgo_preservation(i32 %x) !prof !15 {
+entry:
+ %cmp = icmp sgt i32 %x, 10
+ ; This branch has bias: 97 taken vs 3 not taken
+ br i1 %cmp, label %if.then, label %if.else, !prof !16
+
+if.then:
+ ; Hot path - should have high frequency
+ %add = add nsw i32 %x, 100
+ br label %if.end
+
+if.else:
+ ; Cold path - should have low frequency
+ %sub = sub nsw i32 %x, 50
+ br label %if.end
+
+if.end:
+ %result = phi i32 [ %add, %if.then ], [ %sub, %if.else ]
+ ret i32 %result
+}
+
+; Profile metadata with branch weights 97:3.
+!15 = !{!"function_entry_count", i64 100}
+!16 = !{!"branch_weights", i32 97, i32 3}
+
+; Verify that Branch Probability Analysis runs at O0.
+; PASSES: Branch Probability Analysis
+
+; Verify that the branch probabilities reflect the exact profile data.
+; BRANCH_PROB: ---- Branch Probability Info : test_pgo_preservation ----
+; BRANCH_PROB: set edge entry -> 0 successor probability to {{.*}} = 97.00%
+; BRANCH_PROB: set edge entry -> 1 successor probability to {{.*}} = 3.00%
+
+; Verify that machine IR preserves the branch probabilities from profile data
+; MIR: bb.0.entry:
+; MIR-NEXT: successors: %bb.{{[0-9]+}}({{0x03d70a3d|0x7c28f5c3}}), %bb.{{[0-9]+}}({{0x7c28f5c3|0x03d70a3d}})
+; The two successor probability values should be:
+; - 0x7c28f5c3: approximately 97% (high probability successor)
+; - 0x03d70a3d: approximately 3% (low probability successor)
diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_select.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_select.ll
index fb14782..9352211 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/trunc_select.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_select.ll
@@ -3,16 +3,17 @@
target datalayout = "e-m:m-p1:64:64:64-p:32:32:32-n8:16:32"
-define dso_local i16 @select_i16(i16 %a, i16 %b, i1 %cond) {
+define dso_local i16 @select_i16(i16 %a, i16 %b, i1 %cond) !prof !0 {
; CHECK-LABEL: @select_i16(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i16 [[A:%.*]], i16 [[B:%.*]]
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i16 [[A:%.*]], i16 [[B:%.*]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: ret i16 [[SEL]]
;
entry:
%conv0 = sext i16 %a to i32
%conv1 = sext i16 %b to i32
- %sel = select i1 %cond, i32 %conv0, i32 %conv1
+ %sel = select i1 %cond, i32 %conv0, i32 %conv1, !prof !1
%conv4 = trunc i32 %sel to i16
ret i16 %conv4
}
@@ -134,3 +135,8 @@ entry:
ret i16 %conv4
}
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 2, i32 3}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 2, i32 3}
+
diff --git a/llvm/test/Transforms/AggressiveInstCombine/trunc_select_cmp.ll b/llvm/test/Transforms/AggressiveInstCombine/trunc_select_cmp.ll
index ac9cf2d..69ad625 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/trunc_select_cmp.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/trunc_select_cmp.ll
@@ -1,19 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
-define dso_local i16 @cmp_select_sext_const(i8 %a) {
+define dso_local i16 @cmp_select_sext_const(i8 %a) !prof !0 {
; CHECK-LABEL: @cmp_select_sext_const(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[A:%.*]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 109
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 109, i32 [[CONV]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 109, i32 [[CONV]], !prof [[PROF_1:![0-9]+]]
; CHECK-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16
; CHECK-NEXT: ret i16 [[CONV4]]
;
entry:
%conv = sext i8 %a to i32
%cmp = icmp slt i32 %conv, 109
- %cond = select i1 %cmp, i32 109, i32 %conv
+ %cond = select i1 %cmp, i32 109, i32 %conv, !prof !1
%conv4 = trunc i32 %cond to i16
ret i16 %conv4
}
@@ -209,3 +210,7 @@ define i16 @cmp_select_unsigned_const_i16Const_noTransformation(i8 %a) {
ret i16 %conv4
}
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 2, i32 3}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 2, i32 3}
diff --git a/llvm/test/Transforms/Coroutines/coro-transform-must-elide.ll b/llvm/test/Transforms/Coroutines/coro-elide-safe.ll
index 4eec7ed..722693d 100644
--- a/llvm/test/Transforms/Coroutines/coro-transform-must-elide.ll
+++ b/llvm/test/Transforms/Coroutines/coro-elide-safe.ll
@@ -1,4 +1,8 @@
-; Testing elide performed its job for calls to coroutines marked safe.
+; Coroutine calls marked with `coro_elide_safe` should be elided.
+; Inside `caller`, we expect the `callee` coroutine to be elided.
+; Inside `caller_conditional`, `callee` is only called on an unlikely
+; path, hence we expect the `callee` coroutine NOT to be elided.
+;
; RUN: opt < %s -S -passes='cgscc(coro-annotation-elide)' | FileCheck %s
%struct.Task = type { ptr }
@@ -57,7 +61,7 @@ define ptr @callee.noalloc(i8 %arg, ptr dereferenceable(32) align(8) %frame) {
; Function Attrs: presplitcoroutine
define ptr @caller() #0 {
entry:
- %task = call ptr @callee(i8 0) #1
+ %task = call ptr @callee(i8 0) coro_elide_safe
ret ptr %task
; CHECK: %[[TASK:.+]] = alloca %struct.Task, align 8
; CHECK-NEXT: %[[FRAME:.+]] = alloca [32 x i8], align 8
@@ -69,6 +73,25 @@ entry:
; CHECK-NEXT: ret ptr %[[TASK]]
}
+; CHECK-LABEL: define ptr @caller_conditional(i1 %cond)
+; Function Attrs: presplitcoroutine
+define ptr @caller_conditional(i1 %cond) #0 {
+entry:
+ br i1 %cond, label %call, label %ret
+
+call:
+ ; CHECK-NOT: alloca
+ ; CHECK-NOT: @llvm.coro.id({{.*}}, ptr @callee, {{.*}})
+ ; CHECK: %task = call ptr @callee(i8 0)
+ ; CHECK-NEXT: br label %ret
+ %task = call ptr @callee(i8 0) coro_elide_safe
+ br label %ret
+
+ret:
+ %retval = phi ptr [ %task, %call ], [ null, %entry ]
+ ret ptr %retval
+}
+
declare token @llvm.coro.id(i32, ptr, ptr, ptr)
declare ptr @llvm.coro.begin(token, ptr)
declare ptr @llvm.coro.frame()
@@ -76,4 +99,3 @@ declare ptr @llvm.coro.subfn.addr(ptr, i8)
declare i1 @llvm.coro.alloc(token)
attributes #0 = { presplitcoroutine }
-attributes #1 = { coro_elide_safe }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
index c225ede5..65058bd 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -621,8 +621,6 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %
; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
; I32-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
-; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[START]], i64 0
-; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
; I32-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x ptr> poison, ptr [[SRC_2]], i64 0
; I32-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT1]], <8 x ptr> poison, <8 x i32> zeroinitializer
; I32-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -644,14 +642,6 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %
; I32-NEXT: [[TMP16:%.*]] = add i64 [[TMP8]], 1
; I32-NEXT: [[TMP17:%.*]] = add i64 [[TMP9]], 1
; I32-NEXT: [[TMP18:%.*]] = add i64 [[TMP10]], 1
-; I32-NEXT: [[TMP19:%.*]] = insertelement <8 x i64> poison, i64 [[TMP11]], i32 0
-; I32-NEXT: [[TMP20:%.*]] = insertelement <8 x i64> [[TMP19]], i64 [[TMP12]], i32 1
-; I32-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> [[TMP20]], i64 [[TMP13]], i32 2
-; I32-NEXT: [[TMP22:%.*]] = insertelement <8 x i64> [[TMP21]], i64 [[TMP14]], i32 3
-; I32-NEXT: [[TMP23:%.*]] = insertelement <8 x i64> [[TMP22]], i64 [[TMP15]], i32 4
-; I32-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> [[TMP23]], i64 [[TMP16]], i32 5
-; I32-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 6
-; I32-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 7
; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]]
; I32-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
; I32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
@@ -677,22 +667,21 @@ define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %
; I32-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP41]], i32 6
; I32-NEXT: [[TMP50:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP42]], i32 7
; I32-NEXT: [[TMP51:%.*]] = fcmp oeq <8 x float> [[TMP50]], zeroinitializer
-; I32-NEXT: [[TMP52:%.*]] = mul <8 x i64> [[TMP26]], [[BROADCAST_SPLAT]]
-; I32-NEXT: [[TMP53:%.*]] = extractelement <8 x i64> [[TMP52]], i32 0
+; I32-NEXT: [[TMP53:%.*]] = mul i64 [[TMP11]], [[START]]
+; I32-NEXT: [[TMP55:%.*]] = mul i64 [[TMP12]], [[START]]
+; I32-NEXT: [[TMP57:%.*]] = mul i64 [[TMP13]], [[START]]
+; I32-NEXT: [[TMP59:%.*]] = mul i64 [[TMP14]], [[START]]
+; I32-NEXT: [[TMP61:%.*]] = mul i64 [[TMP15]], [[START]]
+; I32-NEXT: [[TMP63:%.*]] = mul i64 [[TMP16]], [[START]]
+; I32-NEXT: [[TMP65:%.*]] = mul i64 [[TMP17]], [[START]]
+; I32-NEXT: [[TMP67:%.*]] = mul i64 [[TMP18]], [[START]]
; I32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP53]]
-; I32-NEXT: [[TMP55:%.*]] = extractelement <8 x i64> [[TMP52]], i32 1
; I32-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP55]]
-; I32-NEXT: [[TMP57:%.*]] = extractelement <8 x i64> [[TMP52]], i32 2
; I32-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP57]]
-; I32-NEXT: [[TMP59:%.*]] = extractelement <8 x i64> [[TMP52]], i32 3
; I32-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP59]]
-; I32-NEXT: [[TMP61:%.*]] = extractelement <8 x i64> [[TMP52]], i32 4
; I32-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP61]]
-; I32-NEXT: [[TMP63:%.*]] = extractelement <8 x i64> [[TMP52]], i32 5
; I32-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP63]]
-; I32-NEXT: [[TMP65:%.*]] = extractelement <8 x i64> [[TMP52]], i32 6
; I32-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP65]]
-; I32-NEXT: [[TMP67:%.*]] = extractelement <8 x i64> [[TMP52]], i32 7
; I32-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP67]]
; I32-NEXT: [[TMP69:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP54]], i32 0
; I32-NEXT: [[TMP70:%.*]] = insertelement <8 x ptr> [[TMP69]], ptr [[TMP56]], i32 1
@@ -774,7 +763,222 @@ exit:
ret void
}
-attributes #0 = { "target-cpu"="znver3" }
+define void @address_use_in_different_block(ptr noalias %dst, ptr %src.0, ptr %src.1, i32 %x) #0 {
+; I64-LABEL: define void @address_use_in_different_block(
+; I64-SAME: ptr noalias [[DST:%.*]], ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; I64-NEXT: [[ENTRY:.*:]]
+; I64-NEXT: [[X_POS:%.*]] = call i32 @llvm.smax.i32(i32 [[X]], i32 0)
+; I64-NEXT: [[OFFSET:%.*]] = zext i32 [[X_POS]] to i64
+; I64-NEXT: br label %[[VECTOR_PH:.*]]
+; I64: [[VECTOR_PH]]:
+; I64-NEXT: br label %[[VECTOR_BODY:.*]]
+; I64: [[VECTOR_BODY]]:
+; I64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I64-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; I64-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; I64-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; I64-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; I64-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; I64-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; I64-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; I64-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; I64-NEXT: [[TMP8:%.*]] = mul i64 [[TMP0]], [[OFFSET]]
+; I64-NEXT: [[TMP9:%.*]] = mul i64 [[TMP1]], [[OFFSET]]
+; I64-NEXT: [[TMP10:%.*]] = mul i64 [[TMP2]], [[OFFSET]]
+; I64-NEXT: [[TMP11:%.*]] = mul i64 [[TMP3]], [[OFFSET]]
+; I64-NEXT: [[TMP12:%.*]] = mul i64 [[TMP4]], [[OFFSET]]
+; I64-NEXT: [[TMP13:%.*]] = mul i64 [[TMP5]], [[OFFSET]]
+; I64-NEXT: [[TMP14:%.*]] = mul i64 [[TMP6]], [[OFFSET]]
+; I64-NEXT: [[TMP15:%.*]] = mul i64 [[TMP7]], [[OFFSET]]
+; I64-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP8]]
+; I64-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP9]]
+; I64-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP10]]
+; I64-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP11]]
+; I64-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP12]]
+; I64-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP13]]
+; I64-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP14]]
+; I64-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP15]]
+; I64-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP16]], align 4
+; I64-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP17]], align 4
+; I64-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP18]], align 4
+; I64-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP19]], align 4
+; I64-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP20]], align 4
+; I64-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP21]], align 4
+; I64-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP22]], align 4
+; I64-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP23]], align 4
+; I64-NEXT: [[TMP32:%.*]] = sext i32 [[TMP24]] to i64
+; I64-NEXT: [[TMP33:%.*]] = sext i32 [[TMP25]] to i64
+; I64-NEXT: [[TMP34:%.*]] = sext i32 [[TMP26]] to i64
+; I64-NEXT: [[TMP35:%.*]] = sext i32 [[TMP27]] to i64
+; I64-NEXT: [[TMP36:%.*]] = sext i32 [[TMP28]] to i64
+; I64-NEXT: [[TMP37:%.*]] = sext i32 [[TMP29]] to i64
+; I64-NEXT: [[TMP38:%.*]] = sext i32 [[TMP30]] to i64
+; I64-NEXT: [[TMP39:%.*]] = sext i32 [[TMP31]] to i64
+; I64-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP32]]
+; I64-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP33]]
+; I64-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP34]]
+; I64-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP35]]
+; I64-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP36]]
+; I64-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP37]]
+; I64-NEXT: [[TMP46:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP38]]
+; I64-NEXT: [[TMP47:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP39]]
+; I64-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP40]], i64 -8
+; I64-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP41]], i64 -8
+; I64-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[TMP42]], i64 -8
+; I64-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[TMP43]], i64 -8
+; I64-NEXT: [[TMP52:%.*]] = getelementptr i8, ptr [[TMP44]], i64 -8
+; I64-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[TMP45]], i64 -8
+; I64-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[TMP46]], i64 -8
+; I64-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[TMP47]], i64 -8
+; I64-NEXT: [[TMP56:%.*]] = load double, ptr [[TMP48]], align 8
+; I64-NEXT: [[TMP57:%.*]] = load double, ptr [[TMP49]], align 8
+; I64-NEXT: [[TMP58:%.*]] = insertelement <2 x double> poison, double [[TMP56]], i32 0
+; I64-NEXT: [[TMP59:%.*]] = insertelement <2 x double> [[TMP58]], double [[TMP57]], i32 1
+; I64-NEXT: [[TMP60:%.*]] = load double, ptr [[TMP50]], align 8
+; I64-NEXT: [[TMP61:%.*]] = load double, ptr [[TMP51]], align 8
+; I64-NEXT: [[TMP62:%.*]] = insertelement <2 x double> poison, double [[TMP60]], i32 0
+; I64-NEXT: [[TMP63:%.*]] = insertelement <2 x double> [[TMP62]], double [[TMP61]], i32 1
+; I64-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP52]], align 8
+; I64-NEXT: [[TMP65:%.*]] = load double, ptr [[TMP53]], align 8
+; I64-NEXT: [[TMP66:%.*]] = insertelement <2 x double> poison, double [[TMP64]], i32 0
+; I64-NEXT: [[TMP67:%.*]] = insertelement <2 x double> [[TMP66]], double [[TMP65]], i32 1
+; I64-NEXT: [[TMP68:%.*]] = load double, ptr [[TMP54]], align 8
+; I64-NEXT: [[TMP69:%.*]] = load double, ptr [[TMP55]], align 8
+; I64-NEXT: [[TMP70:%.*]] = insertelement <2 x double> poison, double [[TMP68]], i32 0
+; I64-NEXT: [[TMP71:%.*]] = insertelement <2 x double> [[TMP70]], double [[TMP69]], i32 1
+; I64-NEXT: [[TMP72:%.*]] = fsub <2 x double> zeroinitializer, [[TMP59]]
+; I64-NEXT: [[TMP73:%.*]] = fsub <2 x double> zeroinitializer, [[TMP63]]
+; I64-NEXT: [[TMP74:%.*]] = fsub <2 x double> zeroinitializer, [[TMP67]]
+; I64-NEXT: [[TMP75:%.*]] = fsub <2 x double> zeroinitializer, [[TMP71]]
+; I64-NEXT: [[TMP76:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP8]]
+; I64-NEXT: [[TMP77:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP9]]
+; I64-NEXT: [[TMP78:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP10]]
+; I64-NEXT: [[TMP79:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP11]]
+; I64-NEXT: [[TMP80:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP12]]
+; I64-NEXT: [[TMP81:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP13]]
+; I64-NEXT: [[TMP82:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP14]]
+; I64-NEXT: [[TMP83:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP15]]
+; I64-NEXT: [[TMP84:%.*]] = extractelement <2 x double> [[TMP72]], i32 0
+; I64-NEXT: store double [[TMP84]], ptr [[TMP76]], align 8
+; I64-NEXT: [[TMP85:%.*]] = extractelement <2 x double> [[TMP72]], i32 1
+; I64-NEXT: store double [[TMP85]], ptr [[TMP77]], align 8
+; I64-NEXT: [[TMP86:%.*]] = extractelement <2 x double> [[TMP73]], i32 0
+; I64-NEXT: store double [[TMP86]], ptr [[TMP78]], align 8
+; I64-NEXT: [[TMP87:%.*]] = extractelement <2 x double> [[TMP73]], i32 1
+; I64-NEXT: store double [[TMP87]], ptr [[TMP79]], align 8
+; I64-NEXT: [[TMP88:%.*]] = extractelement <2 x double> [[TMP74]], i32 0
+; I64-NEXT: store double [[TMP88]], ptr [[TMP80]], align 8
+; I64-NEXT: [[TMP89:%.*]] = extractelement <2 x double> [[TMP74]], i32 1
+; I64-NEXT: store double [[TMP89]], ptr [[TMP81]], align 8
+; I64-NEXT: [[TMP90:%.*]] = extractelement <2 x double> [[TMP75]], i32 0
+; I64-NEXT: store double [[TMP90]], ptr [[TMP82]], align 8
+; I64-NEXT: [[TMP91:%.*]] = extractelement <2 x double> [[TMP75]], i32 1
+; I64-NEXT: store double [[TMP91]], ptr [[TMP83]], align 8
+; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; I64-NEXT: [[TMP92:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; I64-NEXT: br i1 [[TMP92]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; I64: [[MIDDLE_BLOCK]]:
+; I64-NEXT: br label %[[SCALAR_PH:.*]]
+; I64: [[SCALAR_PH]]:
+;
+; I32-LABEL: define void @address_use_in_different_block(
+; I32-SAME: ptr noalias [[DST:%.*]], ptr [[SRC_0:%.*]], ptr [[SRC_1:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
+; I32-NEXT: [[ENTRY:.*:]]
+; I32-NEXT: [[X_POS:%.*]] = call i32 @llvm.smax.i32(i32 [[X]], i32 0)
+; I32-NEXT: [[OFFSET:%.*]] = zext i32 [[X_POS]] to i64
+; I32-NEXT: br label %[[VECTOR_PH:.*]]
+; I32: [[VECTOR_PH]]:
+; I32-NEXT: br label %[[VECTOR_BODY:.*]]
+; I32: [[VECTOR_BODY]]:
+; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I32-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; I32-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; I32-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; I32-NEXT: [[TMP4:%.*]] = mul i64 [[TMP0]], [[OFFSET]]
+; I32-NEXT: [[TMP5:%.*]] = mul i64 [[TMP1]], [[OFFSET]]
+; I32-NEXT: [[TMP6:%.*]] = mul i64 [[TMP2]], [[OFFSET]]
+; I32-NEXT: [[TMP7:%.*]] = mul i64 [[TMP3]], [[OFFSET]]
+; I32-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP4]]
+; I32-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP5]]
+; I32-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP6]]
+; I32-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC_0]], i64 [[TMP7]]
+; I32-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP8]], align 4
+; I32-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP9]], align 4
+; I32-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 4
+; I32-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
+; I32-NEXT: [[TMP16:%.*]] = sext i32 [[TMP12]] to i64
+; I32-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64
+; I32-NEXT: [[TMP18:%.*]] = sext i32 [[TMP14]] to i64
+; I32-NEXT: [[TMP19:%.*]] = sext i32 [[TMP15]] to i64
+; I32-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP16]]
+; I32-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP17]]
+; I32-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP18]]
+; I32-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[SRC_1]], i64 [[TMP19]]
+; I32-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP20]], i64 -8
+; I32-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 -8
+; I32-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP22]], i64 -8
+; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP23]], i64 -8
+; I32-NEXT: [[TMP28:%.*]] = load double, ptr [[TMP24]], align 8
+; I32-NEXT: [[TMP29:%.*]] = load double, ptr [[TMP25]], align 8
+; I32-NEXT: [[TMP30:%.*]] = load double, ptr [[TMP26]], align 8
+; I32-NEXT: [[TMP31:%.*]] = load double, ptr [[TMP27]], align 8
+; I32-NEXT: [[TMP32:%.*]] = insertelement <4 x double> poison, double [[TMP28]], i32 0
+; I32-NEXT: [[TMP33:%.*]] = insertelement <4 x double> [[TMP32]], double [[TMP29]], i32 1
+; I32-NEXT: [[TMP34:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP30]], i32 2
+; I32-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP34]], double [[TMP31]], i32 3
+; I32-NEXT: [[TMP36:%.*]] = fsub <4 x double> zeroinitializer, [[TMP35]]
+; I32-NEXT: [[TMP37:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP4]]
+; I32-NEXT: [[TMP38:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP5]]
+; I32-NEXT: [[TMP39:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP6]]
+; I32-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP7]]
+; I32-NEXT: [[TMP41:%.*]] = extractelement <4 x double> [[TMP36]], i32 0
+; I32-NEXT: store double [[TMP41]], ptr [[TMP37]], align 8
+; I32-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP36]], i32 1
+; I32-NEXT: store double [[TMP42]], ptr [[TMP38]], align 8
+; I32-NEXT: [[TMP43:%.*]] = extractelement <4 x double> [[TMP36]], i32 2
+; I32-NEXT: store double [[TMP43]], ptr [[TMP39]], align 8
+; I32-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP36]], i32 3
+; I32-NEXT: store double [[TMP44]], ptr [[TMP40]], align 8
+; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; I32-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; I32-NEXT: br i1 [[TMP45]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; I32: [[MIDDLE_BLOCK]]:
+; I32-NEXT: br label %[[SCALAR_PH:.*]]
+; I32: [[SCALAR_PH]]:
+;
+entry:
+ %x.pos = call i32 @llvm.smax.i32(i32 %x, i32 0)
+ %offset = zext i32 %x.pos to i64
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %7 = mul i64 %iv, %offset
+ %gep.src.0 = getelementptr i32, ptr %src.0, i64 %7
+ %l8 = load i32, ptr %gep.src.0, align 4
+ %c = icmp sgt i32 %x, 0
+ br i1 %c, label %loop.latch, label %then
+
+then:
+ br label %loop.latch
+
+loop.latch:
+ %l.ext = sext i32 %l8 to i64
+ %gep.src.1 = getelementptr double, ptr %src.1, i64 %l.ext
+ %13 = getelementptr i8, ptr %gep.src.1, i64 -8
+ %l.2 = load double, ptr %13, align 8
+ %sub = fsub double 0.000000e+00, %l.2
+ %gep.dst = getelementptr double, ptr %dst, i64 %7
+ store double %sub, ptr %gep.dst, align 8
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv, 100
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+
attributes #0 = { "target-cpu"="znver2" }
!0 = distinct !{!0, !1}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
index 774f0db..f293ed1 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -186,12 +186,11 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_UDIV_CONTINUE4]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 777)
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0]], 777
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], [[X]]
@@ -201,7 +200,8 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ]
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]]
; CHECK: pred.udiv.if3:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP7]], 777
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], [[X]]
@@ -212,7 +212,6 @@ define i32 @scalarize_and_sink_gather(ptr %a, i1 %c, i32 %x, i64 %n) {
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[BROADCAST_SPLAT4]]
; CHECK-NEXT: [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
index 0fc3c19..a43e762 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
@@ -401,4 +401,27 @@ b:
ret i32 %1
}
+define i32 @else_will_be_unreachable(i1 %arg) {
+; CHECK-LABEL: @else_will_be_unreachable(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I:%.*]] = select i1 [[ARG:%.*]], i32 0, i32 1
+; CHECK-NEXT: ret i32 [[I]]
+;
+entry:
+ switch i1 %arg, label %else [
+ i1 false, label %if
+ i1 true, label %if
+ ]
+
+if:
+ br i1 %arg, label %else, label %bb
+
+bb:
+ br label %else
+
+else:
+ %i = phi i32 [ 0, %entry ], [ 0, %if ], [ 1, %bb ]
+ ret i32 %i
+}
+
declare void @bar(ptr nonnull dereferenceable(4))
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
index e1e9b57..64e3ed9 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
@@ -2323,13 +2323,13 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 3 1.00 U 1 VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI vsetvli zero, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
# CHECK-NEXT: 1 228 228.00 228 VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv v8, v16, v24
# CHECK-NEXT: 1 228 228.00 228 VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf v8, v16, fs0
# CHECK-NEXT: 1 228 228.00 228 VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf v8, v16, fs0
@@ -2352,22 +2352,22 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 228 228.00 228 VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v v8, v24
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v v8, v24
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v v8, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v v8, v16
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v v8, v16
@@ -2384,13 +2384,13 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 3 1.00 U 1 VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI vsetvli zero, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
# CHECK-NEXT: 1 456 456.00 456 VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv v8, v16, v24
# CHECK-NEXT: 1 456 456.00 456 VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf v8, v16, fs0
# CHECK-NEXT: 1 456 456.00 456 VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf v8, v16, fs0
@@ -2413,22 +2413,22 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 456 456.00 456 VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v v8, v24
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v v8, v24
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v v8, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v v8, v16
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v v8, v16
@@ -2445,13 +2445,13 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 3 1.00 U 1 VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI vsetvli zero, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
# CHECK-NEXT: 1 912 912.00 912 VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv v8, v16, v24
# CHECK-NEXT: 1 912 912.00 912 VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf v8, v16, fs0
# CHECK-NEXT: 1 912 912.00 912 VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf v8, v16, fs0
@@ -2474,22 +2474,22 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 912 912.00 912 VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v v8, v24
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v v8, v24
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v v8, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v v8, v16
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v v8, v16
@@ -2506,13 +2506,13 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 3 1.00 U 1 VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI vsetvli zero, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
# CHECK-NEXT: 1 1824 1824.00 1824 VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv v8, v16, v24
# CHECK-NEXT: 1 1824 1824.00 1824 VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf v8, v16, fs0
# CHECK-NEXT: 1 1824 1824.00 1824 VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf v8, v16, fs0
@@ -2535,22 +2535,22 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 1824 1824.00 1824 VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v v8, v24
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v v8, v24
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v v8, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v v8, v16
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v v8, v16
diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index 7551a80..f04b256 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -44,6 +44,7 @@
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/PGOOptions.h"
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetSelect.h"
@@ -243,6 +244,39 @@ static cl::opt<RunPassOption, true, cl::parser<std::string>> RunPass(
cl::desc("Run compiler only for specified passes (comma separated list)"),
cl::value_desc("pass-name"), cl::location(RunPassOpt));
+// PGO command line options
+enum PGOKind {
+ NoPGO,
+ SampleUse,
+};
+
+static cl::opt<PGOKind>
+ PGOKindFlag("pgo-kind", cl::init(NoPGO), cl::Hidden,
+ cl::desc("The kind of profile guided optimization"),
+ cl::values(clEnumValN(NoPGO, "nopgo", "Do not use PGO."),
+ clEnumValN(SampleUse, "pgo-sample-use-pipeline",
+ "Use sampled profile to guide PGO.")));
+
+// Function to set PGO options on TargetMachine based on command line flags.
+static void setPGOOptions(TargetMachine &TM) {
+ std::optional<PGOOptions> PGOOpt;
+
+ switch (PGOKindFlag) {
+ case SampleUse:
+ // Use default values for other PGOOptions parameters. This parameter
+ // is used to test that PGO data is preserved at -O0.
+ PGOOpt = PGOOptions("", "", "", "", PGOOptions::SampleUse,
+ PGOOptions::NoCSAction);
+ break;
+ case NoPGO:
+ PGOOpt = std::nullopt;
+ break;
+ }
+
+ if (PGOOpt)
+ TM.setPGOOption(PGOOpt);
+}
+
static int compileModule(char **, LLVMContext &);
[[noreturn]] static void reportError(Twine Msg, StringRef Filename = "") {
@@ -558,6 +592,9 @@ static int compileModule(char **argv, LLVMContext &Context) {
TheTriple, CPUStr, FeaturesStr, Options, RM, CM, OLvl));
assert(Target && "Could not allocate target machine!");
+ // Set PGO options based on command line flags
+ setPGOOptions(*Target);
+
return Target->createDataLayout().getStringRepresentation();
};
if (InputLanguage == "mir" ||
@@ -601,6 +638,9 @@ static int compileModule(char **argv, LLVMContext &Context) {
TheTriple, CPUStr, FeaturesStr, Options, RM, CM, OLvl));
assert(Target && "Could not allocate target machine!");
+ // Set PGO options based on command line flags
+ setPGOOptions(*Target);
+
// If we don't have a module then just exit now. We do this down
// here since the CPU/Feature help is underneath the target machine
// creation.
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 434449c..1031932 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -253,25 +253,17 @@ public:
break;
}
case BasicBlockLevel: {
- const auto &BBVecMap = Emb->getBBVecMap();
for (const BasicBlock &BB : F) {
- auto It = BBVecMap.find(&BB);
- if (It != BBVecMap.end()) {
- OS << BB.getName() << ":";
- It->second.print(OS);
- }
+ OS << BB.getName() << ":";
+ Emb->getBBVector(BB).print(OS);
}
break;
}
case InstructionLevel: {
- const auto &InstMap = Emb->getInstVecMap();
for (const BasicBlock &BB : F) {
for (const Instruction &I : BB) {
- auto It = InstMap.find(&I);
- if (It != InstMap.end()) {
- I.print(OS);
- It->second.print(OS);
- }
+ I.print(OS);
+ Emb->getInstVector(I).print(OS);
}
}
break;
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 40b4aa2..8ffc5f6 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -30,7 +30,9 @@ namespace {
class TestableEmbedder : public Embedder {
public:
TestableEmbedder(const Function &F, const Vocabulary &V) : Embedder(F, V) {}
- void computeEmbeddings(const BasicBlock &BB) const override {}
+ Embedding computeEmbeddings(const Instruction &I) const override {
+ return Embedding();
+ }
};
TEST(EmbeddingTest, ConstructorsAndAccessors) {
@@ -321,18 +323,12 @@ protected:
}
};
-TEST_F(IR2VecTestFixture, GetInstVecMap_Symbolic) {
+TEST_F(IR2VecTestFixture, GetInstVec_Symbolic) {
auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
- const auto &InstMap = Emb->getInstVecMap();
-
- EXPECT_EQ(InstMap.size(), 2u);
- EXPECT_TRUE(InstMap.count(AddInst));
- EXPECT_TRUE(InstMap.count(RetInst));
-
- const auto &AddEmb = InstMap.at(AddInst);
- const auto &RetEmb = InstMap.at(RetInst);
+ const auto &AddEmb = Emb->getInstVector(*AddInst);
+ const auto &RetEmb = Emb->getInstVector(*RetInst);
EXPECT_EQ(AddEmb.size(), 2u);
EXPECT_EQ(RetEmb.size(), 2u);
@@ -340,51 +336,17 @@ TEST_F(IR2VecTestFixture, GetInstVecMap_Symbolic) {
EXPECT_TRUE(RetEmb.approximatelyEquals(Embedding(2, 15.5)));
}
-TEST_F(IR2VecTestFixture, GetInstVecMap_FlowAware) {
- auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
- ASSERT_TRUE(static_cast<bool>(Emb));
-
- const auto &InstMap = Emb->getInstVecMap();
-
- EXPECT_EQ(InstMap.size(), 2u);
- EXPECT_TRUE(InstMap.count(AddInst));
- EXPECT_TRUE(InstMap.count(RetInst));
-
- EXPECT_EQ(InstMap.at(AddInst).size(), 2u);
- EXPECT_EQ(InstMap.at(RetInst).size(), 2u);
-
- EXPECT_TRUE(InstMap.at(AddInst).approximatelyEquals(Embedding(2, 25.5)));
- EXPECT_TRUE(InstMap.at(RetInst).approximatelyEquals(Embedding(2, 32.6)));
-}
-
-TEST_F(IR2VecTestFixture, GetBBVecMap_Symbolic) {
- auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
- ASSERT_TRUE(static_cast<bool>(Emb));
-
- const auto &BBMap = Emb->getBBVecMap();
-
- EXPECT_EQ(BBMap.size(), 1u);
- EXPECT_TRUE(BBMap.count(BB));
- EXPECT_EQ(BBMap.at(BB).size(), 2u);
-
- // BB vector should be sum of add and ret: {25.5, 25.5} + {15.5, 15.5} =
- // {41.0, 41.0}
- EXPECT_TRUE(BBMap.at(BB).approximatelyEquals(Embedding(2, 41.0)));
-}
-
-TEST_F(IR2VecTestFixture, GetBBVecMap_FlowAware) {
+TEST_F(IR2VecTestFixture, GetInstVec_FlowAware) {
auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
- const auto &BBMap = Emb->getBBVecMap();
-
- EXPECT_EQ(BBMap.size(), 1u);
- EXPECT_TRUE(BBMap.count(BB));
- EXPECT_EQ(BBMap.at(BB).size(), 2u);
+ const auto &AddEmb = Emb->getInstVector(*AddInst);
+ const auto &RetEmb = Emb->getInstVector(*RetInst);
+ EXPECT_EQ(AddEmb.size(), 2u);
+ EXPECT_EQ(RetEmb.size(), 2u);
- // BB vector should be sum of add and ret: {25.5, 25.5} + {32.6, 32.6} =
- // {58.1, 58.1}
- EXPECT_TRUE(BBMap.at(BB).approximatelyEquals(Embedding(2, 58.1)));
+ EXPECT_TRUE(AddEmb.approximatelyEquals(Embedding(2, 25.5)));
+ EXPECT_TRUE(RetEmb.approximatelyEquals(Embedding(2, 32.6)));
}
TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
@@ -394,6 +356,8 @@ TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
const auto &BBVec = Emb->getBBVector(*BB);
EXPECT_EQ(BBVec.size(), 2u);
+ // BB vector should be sum of add and ret: {25.5, 25.5} + {15.5, 15.5} =
+ // {41.0, 41.0}
EXPECT_TRUE(BBVec.approximatelyEquals(Embedding(2, 41.0)));
}
@@ -404,6 +368,8 @@ TEST_F(IR2VecTestFixture, GetBBVector_FlowAware) {
const auto &BBVec = Emb->getBBVector(*BB);
EXPECT_EQ(BBVec.size(), 2u);
+ // BB vector should be sum of add and ret: {25.5, 25.5} + {32.6, 32.6} =
+ // {58.1, 58.1}
EXPECT_TRUE(BBVec.approximatelyEquals(Embedding(2, 58.1)));
}
@@ -446,15 +412,9 @@ TEST_F(IR2VecTestFixture, MultipleComputeEmbeddingsConsistency_Symbolic) {
EXPECT_TRUE(FuncVec1.approximatelyEquals(FuncVec3));
EXPECT_TRUE(FuncVec2.approximatelyEquals(FuncVec3));
- // Also check that instruction vectors remain consistent
- const auto &InstMap1 = Emb->getInstVecMap();
- const auto &InstMap2 = Emb->getInstVecMap();
-
- EXPECT_EQ(InstMap1.size(), InstMap2.size());
- for (const auto &[Inst, Vec1] : InstMap1) {
- ASSERT_TRUE(InstMap2.count(Inst));
- EXPECT_TRUE(Vec1.approximatelyEquals(InstMap2.at(Inst)));
- }
+ Emb->invalidateEmbeddings();
+ const auto &FuncVec4 = Emb->getFunctionVector();
+ EXPECT_TRUE(FuncVec1.approximatelyEquals(FuncVec4));
}
TEST_F(IR2VecTestFixture, MultipleComputeEmbeddingsConsistency_FlowAware) {
@@ -473,15 +433,9 @@ TEST_F(IR2VecTestFixture, MultipleComputeEmbeddingsConsistency_FlowAware) {
EXPECT_TRUE(FuncVec1.approximatelyEquals(FuncVec3));
EXPECT_TRUE(FuncVec2.approximatelyEquals(FuncVec3));
- // Also check that instruction vectors remain consistent
- const auto &InstMap1 = Emb->getInstVecMap();
- const auto &InstMap2 = Emb->getInstVecMap();
-
- EXPECT_EQ(InstMap1.size(), InstMap2.size());
- for (const auto &[Inst, Vec1] : InstMap1) {
- ASSERT_TRUE(InstMap2.count(Inst));
- EXPECT_TRUE(Vec1.approximatelyEquals(InstMap2.at(Inst)));
- }
+ Emb->invalidateEmbeddings();
+ const auto &FuncVec4 = Emb->getFunctionVector();
+ EXPECT_TRUE(FuncVec1.approximatelyEquals(FuncVec4));
}
static constexpr unsigned MaxOpcodes = Vocabulary::MaxOpcodes;
diff --git a/llvm/unittests/Support/MustacheTest.cpp b/llvm/unittests/Support/MustacheTest.cpp
index e2c4422..3cad4a4 100644
--- a/llvm/unittests/Support/MustacheTest.cpp
+++ b/llvm/unittests/Support/MustacheTest.cpp
@@ -22,7 +22,10 @@ using namespace llvm::json;
TEST(MustacheInterpolation, NoInterpolation) {
// Mustache-free templates should render as-is.
Value D = {};
- Template T("Hello from {Mustache}!\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello from {Mustache}!\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -32,7 +35,10 @@ TEST(MustacheInterpolation, NoInterpolation) {
TEST(MustacheInterpolation, BasicInterpolation) {
// Unadorned tags should interpolate content into the template.
Value D = Object{{"subject", "World"}};
- Template T("Hello, {{subject}}!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{subject}}!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -42,7 +48,10 @@ TEST(MustacheInterpolation, BasicInterpolation) {
TEST(MustacheInterpolation, NoReinterpolation) {
// Interpolated tag output should not be re-interpolated.
Value D = Object{{"template", "{{planet}}"}, {"planet", "Earth"}};
- Template T("{{template}}: {{planet}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{template}}: {{planet}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -54,7 +63,10 @@ TEST(MustacheInterpolation, HTMLEscaping) {
Value D = Object{
{"forbidden", "& \" < >"},
};
- Template T("These characters should be HTML escaped: {{forbidden}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("These characters should be HTML escaped: {{forbidden}}\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -67,7 +79,11 @@ TEST(MustacheInterpolation, Ampersand) {
Value D = Object{
{"forbidden", "& \" < >"},
};
- Template T("These characters should not be HTML escaped: {{&forbidden}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("These characters should not be HTML escaped: {{&forbidden}}\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -77,7 +93,10 @@ TEST(MustacheInterpolation, Ampersand) {
TEST(MustacheInterpolation, BasicIntegerInterpolation) {
// Integers should interpolate seamlessly.
Value D = Object{{"mph", 85}};
- Template T("{{mph}} miles an hour!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{mph}} miles an hour!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -87,7 +106,10 @@ TEST(MustacheInterpolation, BasicIntegerInterpolation) {
TEST(MustacheInterpolation, AmpersandIntegerInterpolation) {
// Integers should interpolate seamlessly.
Value D = Object{{"mph", 85}};
- Template T("{{&mph}} miles an hour!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{&mph}} miles an hour!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -97,7 +119,10 @@ TEST(MustacheInterpolation, AmpersandIntegerInterpolation) {
TEST(MustacheInterpolation, BasicDecimalInterpolation) {
// Decimals should interpolate seamlessly with proper significance.
Value D = Object{{"power", 1.21}};
- Template T("{{power}} jiggawatts!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{power}} jiggawatts!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -107,7 +132,10 @@ TEST(MustacheInterpolation, BasicDecimalInterpolation) {
TEST(MustacheInterpolation, BasicNullInterpolation) {
// Nulls should interpolate as the empty string.
Value D = Object{{"cannot", nullptr}};
- Template T("I ({{cannot}}) be seen!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("I ({{cannot}}) be seen!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -117,7 +145,10 @@ TEST(MustacheInterpolation, BasicNullInterpolation) {
TEST(MustacheInterpolation, AmpersandNullInterpolation) {
// Nulls should interpolate as the empty string.
Value D = Object{{"cannot", nullptr}};
- Template T("I ({{&cannot}}) be seen!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("I ({{&cannot}}) be seen!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -127,7 +158,10 @@ TEST(MustacheInterpolation, AmpersandNullInterpolation) {
TEST(MustacheInterpolation, BasicContextMissInterpolation) {
// Failed context lookups should default to empty strings.
Value D = Object{};
- Template T("I ({{cannot}}) be seen!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("I ({{cannot}}) be seen!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -137,7 +171,10 @@ TEST(MustacheInterpolation, BasicContextMissInterpolation) {
TEST(MustacheInterpolation, DottedNamesBasicInterpolation) {
// Dotted names should be considered a form of shorthand for sections.
Value D = Object{{"person", Object{{"name", "Joe"}}}};
- Template T("{{person.name}} == {{#person}}{{name}}{{/person}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{person.name}} == {{#person}}{{name}}{{/person}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -147,7 +184,10 @@ TEST(MustacheInterpolation, DottedNamesBasicInterpolation) {
TEST(MustacheInterpolation, DottedNamesAmpersandInterpolation) {
// Dotted names should be considered a form of shorthand for sections.
Value D = Object{{"person", Object{{"name", "Joe"}}}};
- Template T("{{&person.name}} == {{#person}}{{&name}}{{/person}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{&person.name}} == {{#person}}{{&name}}{{/person}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -162,7 +202,10 @@ TEST(MustacheInterpolation, DottedNamesArbitraryDepth) {
Object{{"c",
Object{{"d",
Object{{"e", Object{{"name", "Phil"}}}}}}}}}}}};
- Template T("{{a.b.c.d.e.name}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{a.b.c.d.e.name}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -172,7 +215,10 @@ TEST(MustacheInterpolation, DottedNamesArbitraryDepth) {
TEST(MustacheInterpolation, DottedNamesBrokenChains) {
// Any falsey value prior to the last part of the name should yield ''.
Value D = Object{{"a", Object{}}};
- Template T("{{a.b.c}} == ");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{a.b.c}} == ", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -183,7 +229,10 @@ TEST(MustacheInterpolation, DottedNamesBrokenChainResolution) {
// Each part of a dotted name should resolve only against its parent.
Value D =
Object{{"a", Object{{"b", Object{}}}}, {"c", Object{{"name", "Jim"}}}};
- Template T("{{a.b.c.name}} == ");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{a.b.c.name}} == ", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -200,7 +249,10 @@ TEST(MustacheInterpolation, DottedNamesInitialResolution) {
Object{{"d", Object{{"e", Object{{"name", "Phil"}}}}}}}}}}},
{"b",
Object{{"c", Object{{"d", Object{{"e", Object{{"name", "Wrong"}}}}}}}}}};
- Template T("{{#a}}{{b.c.d.e.name}}{{/a}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#a}}{{b.c.d.e.name}}{{/a}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -211,7 +263,10 @@ TEST(MustacheInterpolation, DottedNamesContextPrecedence) {
// Dotted names should be resolved against former resolutions.
Value D =
Object{{"a", Object{{"b", Object{}}}}, {"b", Object{{"c", "ERROR"}}}};
- Template T("{{#a}}{{b.c}}{{/a}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#a}}{{b.c}}{{/a}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -221,7 +276,10 @@ TEST(MustacheInterpolation, DottedNamesContextPrecedence) {
TEST(MustacheInterpolation, DottedNamesAreNotSingleKeys) {
// Dotted names shall not be parsed as single, atomic keys
Value D = Object{{"a.b", "c"}};
- Template T("{{a.b}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{a.b}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -231,7 +289,10 @@ TEST(MustacheInterpolation, DottedNamesAreNotSingleKeys) {
TEST(MustacheInterpolation, DottedNamesNoMasking) {
// Dotted Names in a given context are unavailable due to dot splitting
Value D = Object{{"a.b", "c"}, {"a", Object{{"b", "d"}}}};
- Template T("{{a.b}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{a.b}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -241,7 +302,10 @@ TEST(MustacheInterpolation, DottedNamesNoMasking) {
TEST(MustacheInterpolation, ImplicitIteratorsBasicInterpolation) {
// Unadorned tags should interpolate content into the template.
Value D = "world";
- Template T("Hello, {{.}}!\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{.}}!\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -251,7 +315,10 @@ TEST(MustacheInterpolation, ImplicitIteratorsBasicInterpolation) {
TEST(MustacheInterpolation, ImplicitIteratorsAmersand) {
// Basic interpolation should be HTML escaped.
Value D = "& \" < >";
- Template T("These characters should not be HTML escaped: {{&.}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("These characters should not be HTML escaped: {{&.}}\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -261,7 +328,10 @@ TEST(MustacheInterpolation, ImplicitIteratorsAmersand) {
TEST(MustacheInterpolation, ImplicitIteratorsInteger) {
// Integers should interpolate seamlessly.
Value D = 85;
- Template T("{{.}} miles an hour!\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{.}} miles an hour!\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -271,7 +341,10 @@ TEST(MustacheInterpolation, ImplicitIteratorsInteger) {
TEST(MustacheInterpolation, InterpolationSurroundingWhitespace) {
// Interpolation should not alter surrounding whitespace.
Value D = Object{{"string", "---"}};
- Template T("| {{string}} |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| {{string}} |", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -281,7 +354,10 @@ TEST(MustacheInterpolation, InterpolationSurroundingWhitespace) {
TEST(MustacheInterpolation, AmersandSurroundingWhitespace) {
// Interpolation should not alter surrounding whitespace.
Value D = Object{{"string", "---"}};
- Template T("| {{&string}} |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| {{&string}} |", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -291,7 +367,10 @@ TEST(MustacheInterpolation, AmersandSurroundingWhitespace) {
TEST(MustacheInterpolation, StandaloneInterpolationWithWhitespace) {
// Standalone interpolation should not alter surrounding whitespace.
Value D = Object{{"string", "---"}};
- Template T(" {{string}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{string}}\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -301,7 +380,10 @@ TEST(MustacheInterpolation, StandaloneInterpolationWithWhitespace) {
TEST(MustacheInterpolation, StandaloneAmpersandWithWhitespace) {
// Standalone interpolation should not alter surrounding whitespace.
Value D = Object{{"string", "---"}};
- Template T(" {{&string}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{&string}}\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -311,7 +393,10 @@ TEST(MustacheInterpolation, StandaloneAmpersandWithWhitespace) {
TEST(MustacheInterpolation, InterpolationWithPadding) {
// Superfluous in-tag whitespace should be ignored.
Value D = Object{{"string", "---"}};
- Template T("|{{ string }}|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|{{ string }}|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -321,7 +406,10 @@ TEST(MustacheInterpolation, InterpolationWithPadding) {
TEST(MustacheInterpolation, AmpersandWithPadding) {
// Superfluous in-tag whitespace should be ignored.
Value D = Object{{"string", "---"}};
- Template T("|{{& string }}|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|{{& string }}|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -331,7 +419,10 @@ TEST(MustacheInterpolation, AmpersandWithPadding) {
TEST(MustacheInterpolation, InterpolationWithPaddingAndNewlines) {
// Superfluous in-tag whitespace should be ignored.
Value D = Object{{"string", "---"}};
- Template T("|{{ string \n\n\n }}|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|{{ string \n\n\n }}|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -340,7 +431,10 @@ TEST(MustacheInterpolation, InterpolationWithPaddingAndNewlines) {
TEST(MustacheSections, Truthy) {
Value D = Object{{"boolean", true}};
- Template T("{{#boolean}}This should be rendered.{{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#boolean}}This should be rendered.{{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -349,7 +443,10 @@ TEST(MustacheSections, Truthy) {
TEST(MustacheSections, Falsey) {
Value D = Object{{"boolean", false}};
- Template T("{{#boolean}}This should not be rendered.{{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#boolean}}This should not be rendered.{{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -359,7 +456,10 @@ TEST(MustacheSections, Falsey) {
TEST(MustacheInterpolation, IsFalseyNull) {
// Mustache-free templates should render as-is.
Value D = Object{{"boolean", nullptr}};
- Template T("Hello, {{#boolean}}World{{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{#boolean}}World{{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -369,7 +469,10 @@ TEST(MustacheInterpolation, IsFalseyNull) {
TEST(MustacheInterpolation, IsFalseyArray) {
// Mustache-free templates should render as-is.
Value D = Object{{"boolean", Array()}};
- Template T("Hello, {{#boolean}}World{{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{#boolean}}World{{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -379,7 +482,10 @@ TEST(MustacheInterpolation, IsFalseyArray) {
TEST(MustacheInterpolation, IsFalseyObject) {
// Mustache-free templates should render as-is.
Value D = Object{{"boolean", Object{}}};
- Template T("Hello, {{#boolean}}World{{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{#boolean}}World{{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -389,7 +495,10 @@ TEST(MustacheInterpolation, IsFalseyObject) {
TEST(MustacheInterpolation, DoubleRendering) {
// Mustache-free templates should render as-is.
Value D1 = Object{{"subject", "World"}};
- Template T("Hello, {{subject}}!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{subject}}!", Ctx);
std::string Out1;
raw_string_ostream OS1(Out1);
T.render(D1, OS1);
@@ -403,7 +512,10 @@ TEST(MustacheInterpolation, DoubleRendering) {
TEST(MustacheSections, NullIsFalsey) {
Value D = Object{{"null", nullptr}};
- Template T("{{#null}}This should not be rendered.{{/null}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#null}}This should not be rendered.{{/null}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -412,7 +524,10 @@ TEST(MustacheSections, NullIsFalsey) {
TEST(MustacheSections, Context) {
Value D = Object{{"context", Object{{"name", "Joe"}}}};
- Template T("{{#context}}Hi {{name}}.{{/context}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#context}}Hi {{name}}.{{/context}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -424,7 +539,10 @@ TEST(MustacheSections, ParentContexts) {
{"b", "wrong"},
{"sec", Object{{"b", "bar"}}},
{"c", Object{{"d", "baz"}}}};
- Template T("{{#sec}}{{a}}, {{b}}, {{c.d}}{{/sec}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#sec}}{{a}}, {{b}}, {{c.d}}{{/sec}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -433,7 +551,10 @@ TEST(MustacheSections, ParentContexts) {
TEST(MustacheSections, VariableTest) {
Value D = Object{{"foo", "bar"}};
- Template T("{{#foo}}{{.}} is {{foo}}{{/foo}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#foo}}{{.}} is {{foo}}{{/foo}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -449,6 +570,9 @@ TEST(MustacheSections, ListContexts) {
Array{Object{{"mname", "1"},
{"bottoms", Array{Object{{"bname", "x"}},
Object{{"bname", "y"}}}}}}}}}}};
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
Template T("{{#tops}}"
"{{#middles}}"
"{{tname.lower}}{{mname}}."
@@ -456,7 +580,8 @@ TEST(MustacheSections, ListContexts) {
"{{tname.upper}}{{mname}}{{bname}}."
"{{/bottoms}}"
"{{/middles}}"
- "{{/tops}}");
+ "{{/tops}}",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -468,6 +593,9 @@ TEST(MustacheSections, DeeplyNestedContexts) {
{"a", Object{{"one", 1}}},
{"b", Object{{"two", 2}}},
{"c", Object{{"three", 3}, {"d", Object{{"four", 4}, {"five", 5}}}}}};
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
Template T(
"{{#a}}\n{{one}}\n{{#b}}\n{{one}}{{two}}{{one}}\n{{#c}}\n{{one}}{{two}}{{"
"three}}{{two}}{{one}}\n{{#d}}\n{{one}}{{two}}{{three}}{{four}}{{three}}{"
@@ -477,7 +605,8 @@ TEST(MustacheSections, DeeplyNestedContexts) {
"four}}{{three}}{{two}}{{one}}\n{{/"
"five}}\n{{one}}{{two}}{{three}}{{four}}{{three}}{{two}}{{one}}\n{{/"
"d}}\n{{one}}{{two}}{{three}}{{two}}{{one}}\n{{/"
- "c}}\n{{one}}{{two}}{{one}}\n{{/b}}\n{{one}}\n{{/a}}\n");
+ "c}}\n{{one}}{{two}}{{one}}\n{{/b}}\n{{one}}\n{{/a}}\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -489,7 +618,10 @@ TEST(MustacheSections, DeeplyNestedContexts) {
TEST(MustacheSections, List) {
Value D = Object{{"list", Array{Object{{"item", 1}}, Object{{"item", 2}},
Object{{"item", 3}}}}};
- Template T("{{#list}}{{item}}{{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#list}}{{item}}{{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -498,7 +630,10 @@ TEST(MustacheSections, List) {
TEST(MustacheSections, EmptyList) {
Value D = Object{{"list", Array{}}};
- Template T("{{#list}}Yay lists!{{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#list}}Yay lists!{{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -507,8 +642,12 @@ TEST(MustacheSections, EmptyList) {
TEST(MustacheSections, Doubled) {
Value D = Object{{"bool", true}, {"two", "second"}};
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
Template T("{{#bool}}\n* first\n{{/bool}}\n* "
- "{{two}}\n{{#bool}}\n* third\n{{/bool}}\n");
+ "{{two}}\n{{#bool}}\n* third\n{{/bool}}\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -517,7 +656,10 @@ TEST(MustacheSections, Doubled) {
TEST(MustacheSections, NestedTruthy) {
Value D = Object{{"bool", true}};
- Template T("| A {{#bool}}B {{#bool}}C{{/bool}} D{{/bool}} E |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| A {{#bool}}B {{#bool}}C{{/bool}} D{{/bool}} E |", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -526,7 +668,10 @@ TEST(MustacheSections, NestedTruthy) {
TEST(MustacheSections, NestedFalsey) {
Value D = Object{{"bool", false}};
- Template T("| A {{#bool}}B {{#bool}}C{{/bool}} D{{/bool}} E |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| A {{#bool}}B {{#bool}}C{{/bool}} D{{/bool}} E |", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -535,7 +680,10 @@ TEST(MustacheSections, NestedFalsey) {
TEST(MustacheSections, ContextMisses) {
Value D = Object{};
- Template T("[{{#missing}}Found key 'missing'!{{/missing}}]");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("[{{#missing}}Found key 'missing'!{{/missing}}]", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -544,7 +692,10 @@ TEST(MustacheSections, ContextMisses) {
TEST(MustacheSections, ImplicitIteratorString) {
Value D = Object{{"list", Array{"a", "b", "c", "d", "e"}}};
- Template T("{{#list}}({{.}}){{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#list}}({{.}}){{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -553,7 +704,10 @@ TEST(MustacheSections, ImplicitIteratorString) {
TEST(MustacheSections, ImplicitIteratorInteger) {
Value D = Object{{"list", Array{1, 2, 3, 4, 5}}};
- Template T("{{#list}}({{.}}){{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#list}}({{.}}){{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -562,7 +716,10 @@ TEST(MustacheSections, ImplicitIteratorInteger) {
TEST(MustacheSections, ImplicitIteratorArray) {
Value D = Object{{"list", Array{Array{1, 2, 3}, Array{"a", "b", "c"}}}};
- Template T("{{#list}}({{#.}}{{.}}{{/.}}){{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#list}}({{#.}}{{.}}{{/.}}){{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -571,7 +728,10 @@ TEST(MustacheSections, ImplicitIteratorArray) {
TEST(MustacheSections, ImplicitIteratorHTMLEscaping) {
Value D = Object{{"list", Array{"&", "\"", "<", ">"}}};
- Template T("{{#list}}({{.}}){{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#list}}({{.}}){{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -580,7 +740,10 @@ TEST(MustacheSections, ImplicitIteratorHTMLEscaping) {
TEST(MustacheSections, ImplicitIteratorAmpersand) {
Value D = Object{{"list", Array{"&", "\"", "<", ">"}}};
- Template T("{{#list}}({{&.}}){{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#list}}({{&.}}){{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -589,7 +752,10 @@ TEST(MustacheSections, ImplicitIteratorAmpersand) {
TEST(MustacheSections, ImplicitIteratorRootLevel) {
Value D = Array{Object{{"value", "a"}}, Object{{"value", "b"}}};
- Template T("{{#.}}({{value}}){{/.}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#.}}({{value}}){{/.}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -598,7 +764,10 @@ TEST(MustacheSections, ImplicitIteratorRootLevel) {
TEST(MustacheSections, DottedNamesTruthy) {
Value D = Object{{"a", Object{{"b", Object{{"c", true}}}}}};
- Template T("{{#a.b.c}}Here{{/a.b.c}} == Here");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#a.b.c}}Here{{/a.b.c}} == Here", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -607,7 +776,10 @@ TEST(MustacheSections, DottedNamesTruthy) {
TEST(MustacheSections, DottedNamesFalsey) {
Value D = Object{{"a", Object{{"b", Object{{"c", false}}}}}};
- Template T("{{#a.b.c}}Here{{/a.b.c}} == ");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#a.b.c}}Here{{/a.b.c}} == ", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -615,8 +787,11 @@ TEST(MustacheSections, DottedNamesFalsey) {
}
TEST(MustacheSections, DottedNamesBrokenChains) {
- Value D = Object{{"a", Object{}}};
- Template T("{{#a.b.c}}Here{{/a.b.c}} == ");
+ Value D = Object{{"a", Object{{}}}};
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#a.b.c}}Here{{/a.b.c}} == ", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -625,7 +800,10 @@ TEST(MustacheSections, DottedNamesBrokenChains) {
TEST(MustacheSections, SurroundingWhitespace) {
Value D = Object{{"boolean", true}};
- Template T(" | {{#boolean}}\t|\t{{/boolean}} | \n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" | {{#boolean}}\t|\t{{/boolean}} | \n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -634,7 +812,11 @@ TEST(MustacheSections, SurroundingWhitespace) {
TEST(MustacheSections, InternalWhitespace) {
Value D = Object{{"boolean", true}};
- Template T(" | {{#boolean}} {{! Important Whitespace }}\n {{/boolean}} | \n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" | {{#boolean}} {{! Important Whitespace }}\n {{/boolean}} | \n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -643,7 +825,11 @@ TEST(MustacheSections, InternalWhitespace) {
TEST(MustacheSections, IndentedInlineSections) {
Value D = Object{{"boolean", true}};
- Template T(" {{#boolean}}YES{{/boolean}}\n {{#boolean}}GOOD{{/boolean}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{#boolean}}YES{{/boolean}}\n {{#boolean}}GOOD{{/boolean}}\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -652,7 +838,10 @@ TEST(MustacheSections, IndentedInlineSections) {
TEST(MustacheSections, StandaloneLines) {
Value D = Object{{"boolean", true}};
- Template T("| This Is\n{{#boolean}}\n|\n{{/boolean}}\n| A Line\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| This Is\n{{#boolean}}\n|\n{{/boolean}}\n| A Line\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -661,7 +850,10 @@ TEST(MustacheSections, StandaloneLines) {
TEST(MustacheSections, IndentedStandaloneLines) {
Value D = Object{{"boolean", true}};
- Template T("| This Is\n {{#boolean}}\n|\n {{/boolean}}\n| A Line\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| This Is\n {{#boolean}}\n|\n {{/boolean}}\n| A Line\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -670,7 +862,10 @@ TEST(MustacheSections, IndentedStandaloneLines) {
TEST(MustacheSections, StandaloneLineEndings) {
Value D = Object{{"boolean", true}};
- Template T("|\r\n{{#boolean}}\r\n{{/boolean}}\r\n|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|\r\n{{#boolean}}\r\n{{/boolean}}\r\n|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -679,7 +874,10 @@ TEST(MustacheSections, StandaloneLineEndings) {
TEST(MustacheSections, StandaloneWithoutPreviousLine) {
Value D = Object{{"boolean", true}};
- Template T(" {{#boolean}}\n#{{/boolean}}\n/");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{#boolean}}\n#{{/boolean}}\n/", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -688,7 +886,10 @@ TEST(MustacheSections, StandaloneWithoutPreviousLine) {
TEST(MustacheSections, StandaloneWithoutNewline) {
Value D = Object{{"boolean", true}};
- Template T("#{{#boolean}}\n/\n {{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("#{{#boolean}}\n/\n {{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -697,7 +898,10 @@ TEST(MustacheSections, StandaloneWithoutNewline) {
TEST(MustacheSections, Padding) {
Value D = Object{{"boolean", true}};
- Template T("|{{# boolean }}={{/ boolean }}|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|{{# boolean }}={{/ boolean }}|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -706,7 +910,10 @@ TEST(MustacheSections, Padding) {
TEST(MustacheInvertedSections, Falsey) {
Value D = Object{{"boolean", false}};
- Template T("{{^boolean}}This should be rendered.{{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^boolean}}This should be rendered.{{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -715,7 +922,10 @@ TEST(MustacheInvertedSections, Falsey) {
TEST(MustacheInvertedSections, Truthy) {
Value D = Object{{"boolean", true}};
- Template T("{{^boolean}}This should not be rendered.{{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^boolean}}This should not be rendered.{{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -724,7 +934,10 @@ TEST(MustacheInvertedSections, Truthy) {
TEST(MustacheInvertedSections, NullIsFalsey) {
Value D = Object{{"null", nullptr}};
- Template T("{{^null}}This should be rendered.{{/null}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^null}}This should be rendered.{{/null}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -733,7 +946,10 @@ TEST(MustacheInvertedSections, NullIsFalsey) {
TEST(MustacheInvertedSections, Context) {
Value D = Object{{"context", Object{{"name", "Joe"}}}};
- Template T("{{^context}}Hi {{name}}.{{/context}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^context}}Hi {{name}}.{{/context}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -743,7 +959,10 @@ TEST(MustacheInvertedSections, Context) {
TEST(MustacheInvertedSections, List) {
Value D = Object{
{"list", Array{Object{{"n", 1}}, Object{{"n", 2}}, Object{{"n", 3}}}}};
- Template T("{{^list}}{{n}}{{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^list}}{{n}}{{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -752,7 +971,10 @@ TEST(MustacheInvertedSections, List) {
TEST(MustacheInvertedSections, EmptyList) {
Value D = Object{{"list", Array{}}};
- Template T("{{^list}}Yay lists!{{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^list}}Yay lists!{{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -761,8 +983,12 @@ TEST(MustacheInvertedSections, EmptyList) {
TEST(MustacheInvertedSections, Doubled) {
Value D = Object{{"bool", false}, {"two", "second"}};
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
Template T("{{^bool}}\n* first\n{{/bool}}\n* "
- "{{two}}\n{{^bool}}\n* third\n{{/bool}}\n");
+ "{{two}}\n{{^bool}}\n* third\n{{/bool}}\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -771,7 +997,10 @@ TEST(MustacheInvertedSections, Doubled) {
TEST(MustacheInvertedSections, NestedFalsey) {
Value D = Object{{"bool", false}};
- Template T("| A {{^bool}}B {{^bool}}C{{/bool}} D{{/bool}} E |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| A {{^bool}}B {{^bool}}C{{/bool}} D{{/bool}} E |", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -780,7 +1009,10 @@ TEST(MustacheInvertedSections, NestedFalsey) {
TEST(MustacheInvertedSections, NestedTruthy) {
Value D = Object{{"bool", true}};
- Template T("| A {{^bool}}B {{^bool}}C{{/bool}} D{{/bool}} E |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| A {{^bool}}B {{^bool}}C{{/bool}} D{{/bool}} E |", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -789,7 +1021,10 @@ TEST(MustacheInvertedSections, NestedTruthy) {
TEST(MustacheInvertedSections, ContextMisses) {
Value D = Object{};
- Template T("[{{^missing}}Cannot find key 'missing'!{{/missing}}]");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("[{{^missing}}Cannot find key 'missing'!{{/missing}}]", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -798,7 +1033,10 @@ TEST(MustacheInvertedSections, ContextMisses) {
TEST(MustacheInvertedSections, DottedNamesTruthy) {
Value D = Object{{"a", Object{{"b", Object{{"c", true}}}}}};
- Template T("{{^a.b.c}}Not Here{{/a.b.c}} == ");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^a.b.c}}Not Here{{/a.b.c}} == ", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -807,7 +1045,10 @@ TEST(MustacheInvertedSections, DottedNamesTruthy) {
TEST(MustacheInvertedSections, DottedNamesFalsey) {
Value D = Object{{"a", Object{{"b", Object{{"c", false}}}}}};
- Template T("{{^a.b.c}}Not Here{{/a.b.c}} == Not Here");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^a.b.c}}Not Here{{/a.b.c}} == Not Here", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -816,7 +1057,10 @@ TEST(MustacheInvertedSections, DottedNamesFalsey) {
TEST(MustacheInvertedSections, DottedNamesBrokenChains) {
Value D = Object{{"a", Object{}}};
- Template T("{{^a.b.c}}Not Here{{/a.b.c}} == Not Here");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{^a.b.c}}Not Here{{/a.b.c}} == Not Here", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -825,7 +1069,10 @@ TEST(MustacheInvertedSections, DottedNamesBrokenChains) {
TEST(MustacheInvertedSections, SurroundingWhitespace) {
Value D = Object{{"boolean", false}};
- Template T(" | {{^boolean}}\t|\t{{/boolean}} | \n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" | {{^boolean}}\t|\t{{/boolean}} | \n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -834,7 +1081,11 @@ TEST(MustacheInvertedSections, SurroundingWhitespace) {
TEST(MustacheInvertedSections, InternalWhitespace) {
Value D = Object{{"boolean", false}};
- Template T(" | {{^boolean}} {{! Important Whitespace }}\n {{/boolean}} | \n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" | {{^boolean}} {{! Important Whitespace }}\n {{/boolean}} | \n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -843,7 +1094,11 @@ TEST(MustacheInvertedSections, InternalWhitespace) {
TEST(MustacheInvertedSections, IndentedInlineSections) {
Value D = Object{{"boolean", false}};
- Template T(" {{^boolean}}NO{{/boolean}}\n {{^boolean}}WAY{{/boolean}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{^boolean}}NO{{/boolean}}\n {{^boolean}}WAY{{/boolean}}\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -852,7 +1107,10 @@ TEST(MustacheInvertedSections, IndentedInlineSections) {
TEST(MustacheInvertedSections, StandaloneLines) {
Value D = Object{{"boolean", false}};
- Template T("| This Is\n{{^boolean}}\n|\n{{/boolean}}\n| A Line\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| This Is\n{{^boolean}}\n|\n{{/boolean}}\n| A Line\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -861,7 +1119,10 @@ TEST(MustacheInvertedSections, StandaloneLines) {
TEST(MustacheInvertedSections, StandaloneIndentedLines) {
Value D = Object{{"boolean", false}};
- Template T("| This Is\n {{^boolean}}\n|\n {{/boolean}}\n| A Line\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| This Is\n {{^boolean}}\n|\n {{/boolean}}\n| A Line\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -870,7 +1131,10 @@ TEST(MustacheInvertedSections, StandaloneIndentedLines) {
TEST(MustacheInvertedSections, StandaloneLineEndings) {
Value D = Object{{"boolean", false}};
- Template T("|\r\n{{^boolean}}\r\n{{/boolean}}\r\n|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|\r\n{{^boolean}}\r\n{{/boolean}}\r\n|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -879,7 +1143,10 @@ TEST(MustacheInvertedSections, StandaloneLineEndings) {
TEST(MustacheInvertedSections, StandaloneWithoutPreviousLine) {
Value D = Object{{"boolean", false}};
- Template T(" {{^boolean}}\n^{{/boolean}}\n/");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{^boolean}}\n^{{/boolean}}\n/", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -888,7 +1155,10 @@ TEST(MustacheInvertedSections, StandaloneWithoutPreviousLine) {
TEST(MustacheInvertedSections, StandaloneWithoutNewline) {
Value D = Object{{"boolean", false}};
- Template T("^{{^boolean}}\n/\n {{/boolean}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("^{{^boolean}}\n/\n {{/boolean}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -897,7 +1167,10 @@ TEST(MustacheInvertedSections, StandaloneWithoutNewline) {
TEST(MustacheInvertedSections, Padding) {
Value D = Object{{"boolean", false}};
- Template T("|{{^ boolean }}={{/ boolean }}|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|{{^ boolean }}={{/ boolean }}|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -906,7 +1179,10 @@ TEST(MustacheInvertedSections, Padding) {
TEST(MustachePartials, BasicBehavior) {
Value D = Object{};
- Template T("{{>text}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{>text}}", Ctx);
T.registerPartial("text", "from partial");
std::string Out;
raw_string_ostream OS(Out);
@@ -916,7 +1192,10 @@ TEST(MustachePartials, BasicBehavior) {
TEST(MustachePartials, FailedLookup) {
Value D = Object{};
- Template T("{{>text}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{>text}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -925,7 +1204,10 @@ TEST(MustachePartials, FailedLookup) {
TEST(MustachePartials, Context) {
Value D = Object{{"text", "content"}};
- Template T("{{>partial}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{>partial}}", Ctx);
T.registerPartial("partial", "*{{text}}*");
std::string Out;
raw_string_ostream OS(Out);
@@ -937,7 +1219,10 @@ TEST(MustachePartials, Recursion) {
Value D =
Object{{"content", "X"},
{"nodes", Array{Object{{"content", "Y"}, {"nodes", Array{}}}}}};
- Template T("{{>node}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{>node}}", Ctx);
T.registerPartial("node", "{{content}}({{#nodes}}{{>node}}{{/nodes}})");
std::string Out;
raw_string_ostream OS(Out);
@@ -947,7 +1232,10 @@ TEST(MustachePartials, Recursion) {
TEST(MustachePartials, Nested) {
Value D = Object{{"a", "hello"}, {"b", "world"}};
- Template T("{{>outer}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{>outer}}", Ctx);
T.registerPartial("outer", "*{{a}} {{>inner}}*");
T.registerPartial("inner", "{{b}}!");
std::string Out;
@@ -958,7 +1246,10 @@ TEST(MustachePartials, Nested) {
TEST(MustachePartials, SurroundingWhitespace) {
Value D = Object{};
- Template T("| {{>partial}} |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| {{>partial}} |", Ctx);
T.registerPartial("partial", "\t|\t");
std::string Out;
raw_string_ostream OS(Out);
@@ -968,7 +1259,10 @@ TEST(MustachePartials, SurroundingWhitespace) {
TEST(MustachePartials, InlineIndentation) {
Value D = Object{{"data", "|"}};
- Template T(" {{data}} {{> partial}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{data}} {{> partial}}\n", Ctx);
T.registerPartial("partial", "<\n<");
std::string Out;
raw_string_ostream OS(Out);
@@ -978,7 +1272,10 @@ TEST(MustachePartials, InlineIndentation) {
TEST(MustachePartials, PaddingWhitespace) {
Value D = Object{{"boolean", true}};
- Template T("|{{> partial }}|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|{{> partial }}|", Ctx);
T.registerPartial("partial", "[]");
std::string Out;
raw_string_ostream OS(Out);
@@ -987,7 +1284,10 @@ TEST(MustachePartials, PaddingWhitespace) {
}
TEST(MustachePartials, StandaloneIndentation) {
- mustache::Template T("\\\n {{>partial}}\n/\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ mustache::Template T("\\\n {{>partial}}\n/\n", Ctx);
T.registerPartial("partial", "|\n{{{content}}}\n|\n");
std::string O;
raw_string_ostream OS(O);
@@ -998,7 +1298,10 @@ TEST(MustachePartials, StandaloneIndentation) {
TEST(MustacheLambdas, BasicInterpolation) {
Value D = Object{};
- Template T("Hello, {{lambda}}!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{lambda}}!", Ctx);
Lambda L = []() -> llvm::json::Value { return "World"; };
T.registerLambda("lambda", L);
std::string Out;
@@ -1009,7 +1312,10 @@ TEST(MustacheLambdas, BasicInterpolation) {
TEST(MustacheLambdas, InterpolationExpansion) {
Value D = Object{{"planet", "World"}};
- Template T("Hello, {{lambda}}!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{lambda}}!", Ctx);
Lambda L = []() -> llvm::json::Value { return "{{planet}}"; };
T.registerLambda("lambda", L);
std::string Out;
@@ -1020,7 +1326,10 @@ TEST(MustacheLambdas, InterpolationExpansion) {
TEST(MustacheLambdas, BasicMultipleCalls) {
Value D = Object{};
- Template T("{{lambda}} == {{lambda}} == {{lambda}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{lambda}} == {{lambda}} == {{lambda}}", Ctx);
int I = 0;
Lambda L = [&I]() -> llvm::json::Value {
I += 1;
@@ -1035,7 +1344,10 @@ TEST(MustacheLambdas, BasicMultipleCalls) {
TEST(MustacheLambdas, Escaping) {
Value D = Object{};
- Template T("<{{lambda}}{{&lambda}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("<{{lambda}}{{&lambda}}", Ctx);
Lambda L = []() -> llvm::json::Value { return ">"; };
T.registerLambda("lambda", L);
std::string Out;
@@ -1046,7 +1358,10 @@ TEST(MustacheLambdas, Escaping) {
TEST(MustacheLambdas, Sections) {
Value D = Object{};
- Template T("<{{#lambda}}{{x}}{{/lambda}}>");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("<{{#lambda}}{{x}}{{/lambda}}>", Ctx);
SectionLambda L = [](StringRef Text) -> llvm::json::Value {
if (Text == "{{x}}") {
return "yes";
@@ -1064,7 +1379,10 @@ TEST(MustacheLambdas, SectionExpansion) {
Value D = Object{
{"planet", "Earth"},
};
- Template T("<{{#lambda}}-{{/lambda}}>");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("<{{#lambda}}-{{/lambda}}>", Ctx);
SectionLambda L = [](StringRef Text) -> llvm::json::Value {
SmallString<128> Result;
Result += Text;
@@ -1081,7 +1399,10 @@ TEST(MustacheLambdas, SectionExpansion) {
TEST(MustacheLambdas, SectionsMultipleCalls) {
Value D = Object{};
- Template T("{{#lambda}}FILE{{/lambda}} != {{#lambda}}LINE{{/lambda}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#lambda}}FILE{{/lambda}} != {{#lambda}}LINE{{/lambda}}", Ctx);
SectionLambda L = [](StringRef Text) -> llvm::json::Value {
SmallString<128> Result;
Result += "__";
@@ -1098,7 +1419,10 @@ TEST(MustacheLambdas, SectionsMultipleCalls) {
TEST(MustacheLambdas, InvertedSections) {
Value D = Object{{"static", "static"}};
- Template T("<{{^lambda}}{{static}}{{/lambda}}>");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("<{{^lambda}}{{static}}{{/lambda}}>", Ctx);
SectionLambda L = [](StringRef Text) -> llvm::json::Value { return false; };
T.registerLambda("lambda", L);
std::string Out;
@@ -1110,7 +1434,10 @@ TEST(MustacheLambdas, InvertedSections) {
TEST(MustacheComments, Inline) {
// Comment blocks should be removed from the template.
Value D = {};
- Template T("12345{{! Comment Block! }}67890");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("12345{{! Comment Block! }}67890", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1120,7 +1447,10 @@ TEST(MustacheComments, Inline) {
TEST(MustacheComments, Multiline) {
// Multiline comments should be permitted.
Value D = {};
- Template T("12345{{!\n This is a\n multi-line comment...\n}}67890\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("12345{{!\n This is a\n multi-line comment...\n}}67890\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1130,7 +1460,10 @@ TEST(MustacheComments, Multiline) {
TEST(MustacheComments, Standalone) {
// All standalone comment lines should be removed.
Value D = {};
- Template T("Begin.\n{{! Comment Block! }}\nEnd.\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Begin.\n{{! Comment Block! }}\nEnd.\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1140,7 +1473,10 @@ TEST(MustacheComments, Standalone) {
TEST(MustacheComments, IndentedStandalone) {
// All standalone comment lines should be removed.
Value D = {};
- Template T("Begin.\n {{! Indented Comment Block! }}\nEnd.\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Begin.\n {{! Indented Comment Block! }}\nEnd.\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1150,7 +1486,10 @@ TEST(MustacheComments, IndentedStandalone) {
TEST(MustacheComments, StandaloneLineEndings) {
// "\r\n" should be considered a newline for standalone tags.
Value D = {};
- Template T("|\r\n{{! Standalone Comment }}\r\n|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|\r\n{{! Standalone Comment }}\r\n|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1160,7 +1499,10 @@ TEST(MustacheComments, StandaloneLineEndings) {
TEST(MustacheComments, StandaloneWithoutPreviousLine) {
// Standalone tags should not require a newline to precede them.
Value D = {};
- Template T(" {{! I'm Still Standalone }}\n!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{! I'm Still Standalone }}\n!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1170,7 +1512,10 @@ TEST(MustacheComments, StandaloneWithoutPreviousLine) {
TEST(MustacheComments, StandaloneWithoutNewline) {
// Standalone tags should not require a newline to follow them.
Value D = {};
- Template T("!\n {{! I'm Still Standalone }}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("!\n {{! I'm Still Standalone }}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1180,7 +1525,10 @@ TEST(MustacheComments, StandaloneWithoutNewline) {
TEST(MustacheComments, MultilineStandalone) {
// All standalone comment lines should be removed.
Value D = {};
- Template T("Begin.\n{{!\nSomething's going on here...\n}}\nEnd.\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Begin.\n{{!\nSomething's going on here...\n}}\nEnd.\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1190,7 +1538,11 @@ TEST(MustacheComments, MultilineStandalone) {
TEST(MustacheComments, IndentedMultilineStandalone) {
// All standalone comment lines should be removed.
Value D = {};
- Template T("Begin.\n {{!\n Something's going on here...\n }}\nEnd.\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Begin.\n {{!\n Something's going on here...\n }}\nEnd.\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1200,7 +1552,10 @@ TEST(MustacheComments, IndentedMultilineStandalone) {
TEST(MustacheComments, IndentedInline) {
// Inline comments should not strip whitespace.
Value D = {};
- Template T(" 12 {{! 34 }}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" 12 {{! 34 }}\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1210,7 +1565,10 @@ TEST(MustacheComments, IndentedInline) {
TEST(MustacheComments, SurroundingWhitespace) {
// Comment removal should preserve surrounding whitespace.
Value D = {};
- Template T("12345 {{! Comment Block! }} 67890");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("12345 {{! Comment Block! }} 67890", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1221,7 +1579,10 @@ TEST(MustacheComments, VariableNameCollision) {
// Comments must never render, even if a variable with the same name exists.
Value D = Object{
{"! comment", 1}, {"! comment ", 2}, {"!comment", 3}, {"comment", 4}};
- Template T("comments never show: >{{! comment }}<");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("comments never show: >{{! comment }}<", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1234,7 +1595,10 @@ TEST(MustacheComments, VariableNameCollision) {
// implemented, these assertions should be changed back to EXPECT_EQ.
TEST(MustacheTripleMustache, Basic) {
Value D = Object{{"subject", "<b>World</b>"}};
- Template T("Hello, {{{subject}}}!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Hello, {{{subject}}}!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1243,7 +1607,10 @@ TEST(MustacheTripleMustache, Basic) {
TEST(MustacheTripleMustache, IntegerInterpolation) {
Value D = Object{{"mph", 85}};
- Template T("{{{mph}}} miles an hour!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{{mph}}} miles an hour!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1252,7 +1619,10 @@ TEST(MustacheTripleMustache, IntegerInterpolation) {
TEST(MustacheTripleMustache, DecimalInterpolation) {
Value D = Object{{"power", 1.21}};
- Template T("{{{power}}} jiggawatts!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{{power}}} jiggawatts!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1261,7 +1631,10 @@ TEST(MustacheTripleMustache, DecimalInterpolation) {
TEST(MustacheTripleMustache, NullInterpolation) {
Value D = Object{{"cannot", nullptr}};
- Template T("I ({{{cannot}}}) be seen!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("I ({{{cannot}}}) be seen!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1270,7 +1643,10 @@ TEST(MustacheTripleMustache, NullInterpolation) {
TEST(MustacheTripleMustache, ContextMissInterpolation) {
Value D = Object{};
- Template T("I ({{{cannot}}}) be seen!");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("I ({{{cannot}}}) be seen!", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1279,7 +1655,10 @@ TEST(MustacheTripleMustache, ContextMissInterpolation) {
TEST(MustacheTripleMustache, DottedNames) {
Value D = Object{{"person", Object{{"name", "<b>Joe</b>"}}}};
- Template T("{{{person.name}}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{{person.name}}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1288,7 +1667,10 @@ TEST(MustacheTripleMustache, DottedNames) {
TEST(MustacheTripleMustache, ImplicitIterator) {
Value D = Object{{"list", Array{"<a>", "<b>"}}};
- Template T("{{#list}}({{{.}}}){{/list}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{#list}}({{{.}}}){{/list}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1297,7 +1679,10 @@ TEST(MustacheTripleMustache, ImplicitIterator) {
TEST(MustacheTripleMustache, SurroundingWhitespace) {
Value D = Object{{"string", "---"}};
- Template T("| {{{string}}} |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| {{{string}}} |", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1306,7 +1691,10 @@ TEST(MustacheTripleMustache, SurroundingWhitespace) {
TEST(MustacheTripleMustache, Standalone) {
Value D = Object{{"string", "---"}};
- Template T(" {{{string}}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{{string}}}\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1315,7 +1703,10 @@ TEST(MustacheTripleMustache, Standalone) {
TEST(MustacheTripleMustache, WithPadding) {
Value D = Object{{"string", "---"}};
- Template T("|{{{ string }}}|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|{{{ string }}}|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1324,7 +1715,10 @@ TEST(MustacheTripleMustache, WithPadding) {
TEST(MustacheDelimiters, PairBehavior) {
Value D = Object{{"text", "Hey!"}};
- Template T("{{=<% %>=}}(<%text%>)");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("{{=<% %>=}}(<%text%>)", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1333,7 +1727,10 @@ TEST(MustacheDelimiters, PairBehavior) {
TEST(MustacheDelimiters, SpecialCharacters) {
Value D = Object{{"text", "It worked!"}};
- Template T("({{=[ ]=}}[text])");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("({{=[ ]=}}[text])", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1342,9 +1739,12 @@ TEST(MustacheDelimiters, SpecialCharacters) {
TEST(MustacheDelimiters, Sections) {
Value D = Object{{"section", true}, {"data", "I got interpolated."}};
- auto T =
- Template("[\n{{#section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= "
- "| | =}}\n|#section|\n {{data}}\n |data|\n|/section|\n]\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("[\n{{#section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= "
+ "| | =}}\n|#section|\n {{data}}\n |data|\n|/section|\n]\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1355,9 +1755,12 @@ TEST(MustacheDelimiters, Sections) {
TEST(MustacheDelimiters, InvertedSections) {
Value D = Object{{"section", false}, {"data", "I got interpolated."}};
- auto T =
- Template("[\n{{^section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= "
- "| | =}}\n|^section|\n {{data}}\n |data|\n|/section|\n]\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("[\n{{^section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= "
+ "| | =}}\n|^section|\n {{data}}\n |data|\n|/section|\n]\n",
+ Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1368,7 +1771,10 @@ TEST(MustacheDelimiters, InvertedSections) {
TEST(MustacheDelimiters, PartialInheritence) {
Value D = Object{{"value", "yes"}};
- Template T("[ {{>include}} ]\n{{= | | =}}\n[ |>include| ]\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("[ {{>include}} ]\n{{= | | =}}\n[ |>include| ]\n", Ctx);
T.registerPartial("include", ".{{value}}.");
std::string Out;
raw_string_ostream OS(Out);
@@ -1378,7 +1784,10 @@ TEST(MustacheDelimiters, PartialInheritence) {
TEST(MustacheDelimiters, PostPartialBehavior) {
Value D = Object{{"value", "yes"}};
- Template T("[ {{>include}} ]\n[ .{{value}}. .|value|. ]\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("[ {{>include}} ]\n[ .{{value}}. .|value|. ]\n", Ctx);
T.registerPartial("include", ".{{value}}. {{= | | =}} .|value|.");
std::string Out;
raw_string_ostream OS(Out);
@@ -1388,7 +1797,10 @@ TEST(MustacheDelimiters, PostPartialBehavior) {
TEST(MustacheDelimiters, SurroundingWhitespace) {
Value D = Object{};
- Template T("| {{=@ @=}} |");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("| {{=@ @=}} |", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1397,7 +1809,10 @@ TEST(MustacheDelimiters, SurroundingWhitespace) {
TEST(MustacheDelimiters, OutlyingWhitespaceInline) {
Value D = Object{};
- Template T(" | {{=@ @=}}\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" | {{=@ @=}}\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1406,7 +1821,10 @@ TEST(MustacheDelimiters, OutlyingWhitespaceInline) {
TEST(MustacheDelimiters, StandaloneTag) {
Value D = Object{};
- Template T("Begin.\n{{=@ @=}}\nEnd.\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Begin.\n{{=@ @=}}\nEnd.\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1415,7 +1833,10 @@ TEST(MustacheDelimiters, StandaloneTag) {
TEST(MustacheDelimiters, IndentedStandaloneTag) {
Value D = Object{};
- Template T("Begin.\n {{=@ @=}}\nEnd.\n");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("Begin.\n {{=@ @=}}\nEnd.\n", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1424,7 +1845,10 @@ TEST(MustacheDelimiters, IndentedStandaloneTag) {
TEST(MustacheDelimiters, StandaloneLineEndings) {
Value D = Object{};
- Template T("|\r\n{{= @ @ =}}\r\n|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|\r\n{{= @ @ =}}\r\n|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1433,7 +1857,10 @@ TEST(MustacheDelimiters, StandaloneLineEndings) {
TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) {
Value D = Object{};
- Template T(" {{=@ @=}}\n=");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(" {{=@ @=}}\n=", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1442,7 +1869,10 @@ TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) {
TEST(MustacheDelimiters, StandaloneWithoutNewline) {
Value D = Object{};
- Template T("=\n {{=@ @=}}");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("=\n {{=@ @=}}", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
@@ -1451,7 +1881,10 @@ TEST(MustacheDelimiters, StandaloneWithoutNewline) {
TEST(MustacheDelimiters, PairwithPadding) {
Value D = Object{};
- Template T("|{{= @ @ =}}|");
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T("|{{= @ @ =}}|", Ctx);
std::string Out;
raw_string_ostream OS(Out);
T.render(D, OS);
diff --git a/llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn
index 63bf726..8fe30b8 100644
--- a/llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn
@@ -9,7 +9,6 @@ static_library("ASTMatchers") {
sources = [
"ASTMatchFinder.cpp",
"ASTMatchersInternal.cpp",
- "GtestMatchers.cpp",
"LowLevelHelpers.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/clang/unittests/ASTMatchers/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/ASTMatchers/BUILD.gn
index 10f540b..56d3484 100644
--- a/llvm/utils/gn/secondary/clang/unittests/ASTMatchers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/ASTMatchers/BUILD.gn
@@ -17,6 +17,5 @@ unittest("ASTMatchersTests") {
"ASTMatchersNarrowingTest.cpp",
"ASTMatchersNodeTest.cpp",
"ASTMatchersTraversalTest.cpp",
- "GtestMatchersTest.cpp",
]
}
diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
index 9007eb3..93e2efe 100644
--- a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
+++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
@@ -212,7 +212,10 @@ static void runTest(StringRef InputFile) {
for (Value V : *TestArray) {
auto TestData =
ExitOnErr(TestData::createTestData(V.getAsObject(), InputFile));
- Template T(TestData.TemplateStr);
+ BumpPtrAllocator Allocator;
+ StringSaver Saver(Allocator);
+ MustacheContext Ctx(Allocator, Saver);
+ Template T(TestData.TemplateStr, Ctx);
registerPartials(TestData.Partials, T);
std::string ActualStr;
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index db1b7e3..6925cec 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -292,18 +292,74 @@ def ROCDL_BarrierOp : ROCDL_Op<"barrier"> {
let assemblyFormat = "attr-dict";
}
+def ROCDLBufferLDS : LLVM_PointerInAddressSpace<3>;
+
+def ROCDL_BarrierInitOp : ROCDL_IntrOp<"s.barrier.init", [], [], [], 0, 0, 0, 0, [1], ["id"]>,
+ Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr, I32Attr:$id)> {
+ let description = [{
+ Available on gfx1250+.
+ }];
+ let results = (outs);
+ let assemblyFormat = "$ptr `,` $id attr-dict";
+}
+
def ROCDL_BarrierSignalOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.signal", [], 0, [0], ["id"]>,
Arguments<(ins I32Attr:$id)> {
let results = (outs);
let assemblyFormat = "$id attr-dict";
}
+def ROCDL_BarrierSignalVarOp : ROCDL_IntrOp<"s.barrier.signal.var", [], [], [], 0, 0, 0, 0, [1], ["id"]>,
+ Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr, I32Attr:$id)> {
+ let description = [{
+ Available on gfx1250+.
+ }];
+ let results = (outs);
+ let assemblyFormat = "$ptr `,` $id attr-dict";
+}
+
+def ROCDL_BarrierJoinOp : ROCDL_IntrOp<"s.barrier.join", [], [], [], 0>,
+ Arguments<(ins Arg<ROCDLBufferLDS, "", []>:$ptr)> {
+ let description = [{
+ Available on gfx1250+.
+ }];
+ let results = (outs);
+ let assemblyFormat = "$ptr attr-dict";
+}
+
+def ROCDL_BarrierLeaveOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.leave", [], 0, [0], ["id"]>,
+ Arguments<(ins I16Attr:$id)> {
+ let description = [{
+ Available on gfx1250+.
+ }];
+ let results = (outs);
+ let assemblyFormat = "$id attr-dict";
+}
+
def ROCDL_BarrierWaitOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.wait", [], 0, [0], ["id"]>,
Arguments<(ins I16Attr:$id)> {
let results = (outs);
let assemblyFormat = "$id attr-dict";
}
+def ROCDL_BarrierSignalIsfirstOp : ROCDL_ConcreteNonMemIntrOp<"s.barrier.signal.isfirst", [], 1, [0], ["id"]>,
+ Arguments<(ins I32Attr:$id)> {
+ let description = [{
+ Available on gfx1250+.
+ }];
+ let results = (outs I1:$res);
+ let assemblyFormat = "$id attr-dict `:` type($res)";
+}
+
+def ROCDL_GetBarrierStateOp : ROCDL_ConcreteNonMemIntrOp<"s.get.barrier.state", [], 1, [0], ["id"]>,
+ Arguments<(ins I32Attr:$id)> {
+ let description = [{
+ Available on gfx1250+.
+ }];
+ let results = (outs I32:$res);
+ let assemblyFormat = "$id attr-dict `:` type($res)";
+}
+
def ROCDL_WaitDscntOp: ROCDL_ConcreteNonMemIntrOp<"s.wait.dscnt", [], 0, [0], ["count"]>,
Arguments<(ins I16Attr:$count)> {
let summary = "Wait until DSCNT is less than or equal to `count`";
@@ -497,7 +553,6 @@ def ROCDL_wmma_i32_16x16x32_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x32.iu4", [1]
// LDS transpose intrinsics (available in GFX950)
def ROCDLGlobalBuffer : LLVM_PointerInAddressSpace<1>;
-def ROCDLBufferLDS : LLVM_PointerInAddressSpace<3>;
class ROCDL_LDS_Read_Tr_IntrOp<string mnemonic> :
ROCDL_IntrOp<mnemonic, [1], [], [], 1, 0, 1> {
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 4919d9a..9d62491 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -524,6 +524,40 @@ VectorizationState::maskOperation(RewriterBase &rewriter, Operation *opToMask,
if (!mask) {
LDBG() << "No mask required";
+ if (assumeDynamicDimsMatchVecSizes) {
+ llvm::TypeSwitch<Operation *>(opToMask)
+ .Case<vector::TransferReadOp, vector::TransferWriteOp>(
+ [&](auto xferOp) {
+ // For vector.transfer_read and vector.transfer_write, there is
+ // also the `in-bounds` attribute that has to be set explicitly
+ // to true. Otherwise, "out-of-bounds" access will be assumed
+ // and masks will be generated while lowering these.
+ LDBG() << "Assuming dynamic dimensions match vector sizes and "
+ "setting their in-bounds to true!";
+ SmallVector<bool> inBoundsMap = xferOp.getInBoundsValues();
+ ShapedType xferType = xferOp.getShapedType();
+ AffineMap permMap = xferOp.getPermutationMap();
+ // Only set the in-bounds values to true for dynamic dims.
+ // Different mechanisms will set these accordingly for the
+ // static dims.
+ for (unsigned i = 0; i < xferOp.getTransferRank(); i++) {
+ auto dimExpr = dyn_cast<AffineDimExpr>(permMap.getResult(i));
+ // Skip broadcast dimensions.
+ if (!dimExpr)
+ continue;
+ unsigned pos = dimExpr.getPosition();
+ if (xferType.isDynamicDim(pos))
+ inBoundsMap[i] = true;
+ }
+ rewriter.modifyOpInPlace(xferOp, [&]() {
+ xferOp.setInBoundsAttr(
+ rewriter.getBoolArrayAttr(inBoundsMap));
+ });
+ })
+ .Default([](Operation *op) {
+ // No-op if the operation is not an xfer read or write.
+ });
+ }
return opToMask;
}
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 784e5d6..c28d2fc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -720,7 +720,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
ConversionPatternRewriter &rewriter) const override {
auto vecAttr = dyn_cast<DenseElementsAttr>(op.getValue());
auto vecType = dyn_cast<VectorType>(op.getType());
- if (!vecAttr || !vecAttr.isSplat() || !vecType)
+ if (!vecAttr || !vecType)
return failure();
xegpu::DistributeLayoutAttr layout =
@@ -733,22 +733,139 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
int count;
std::tie(sgShape, count) = getSgShapeAndCount(wgShape, layout);
- // Current limitation: constant of vector with single value.
- // TODO: support more complex cases, e.g., vector with multiple values.
- Attribute singleVal = vecAttr.getSplatValue<Attribute>();
-
auto newType = VectorType::get(sgShape, vecType.getElementType());
- auto sgAttr = DenseElementsAttr::get(newType, singleVal);
- auto cstOp =
- arith::ConstantOp::create(rewriter, op.getLoc(), newType, sgAttr);
- if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
- !layout.getEffectiveInstDataAsInt().empty())
- xegpu::setDistributeLayoutAttr(cstOp->getResult(0),
- layout.dropSgLayoutAndData());
- SmallVector<Value> newConsts(count, cstOp);
+ Location loc = op.getLoc();
+ auto eltType = vecType.getElementType();
- rewriter.replaceOpWithMultiple(op, {newConsts});
- return success();
+ auto setLayoutIfNeeded = [&](Value val) {
+ if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
+ !layout.getEffectiveInstDataAsInt().empty()) {
+ xegpu::setDistributeLayoutAttr(llvm::dyn_cast<OpResult>(val),
+ layout.dropSgLayoutAndData());
+ }
+ };
+
+ if (vecAttr.isSplat()) {
+ // Splat: single value for all subgroups
+ Attribute singleVal = vecAttr.getSplatValue<Attribute>();
+ auto sgAttr = DenseElementsAttr::get(newType, singleVal);
+ auto cstOp = arith::ConstantOp::create(rewriter, loc, newType, sgAttr);
+ setLayoutIfNeeded(cstOp->getResult(0));
+ rewriter.replaceOp(op, cstOp);
+ return success();
+ } else if (sgShape == wgShape) { // if the entire vector is shared by all
+ // subgroups, don't distribute
+ auto newConstOp =
+ arith::ConstantOp::create(rewriter, op.getLoc(), vecType, vecAttr);
+ setLayoutIfNeeded(newConstOp->getResult(0));
+ rewriter.replaceOp(op, newConstOp);
+ return success();
+ } else {
+ // Non-splat constant
+ // Only supports 1D & 2D
+ // TODO: support other cases that require SLM access
+ if (!eltType.isIndex())
+ return rewriter.notifyMatchFailure(
+ op, "Unsupported element type for non-splat constant op.");
+
+ if (wgShape.size() > 2)
+ return rewriter.notifyMatchFailure(
+ op, "Only 1D & 2D vector constant supported");
+
+ SmallVector<Attribute> values(vecAttr.getValues<Attribute>());
+ int64_t rowStride = 0, colStride = 0;
+ int64_t rows = wgShape.size() == 1 ? 1 : wgShape[0];
+ int64_t cols = wgShape.size() == 1 ? wgShape[0] : wgShape[1];
+
+ // Compute colStride and rowStride, and check for constant strides.
+ if (cols > 1) {
+ colStride = cast<IntegerAttr>(values[1]).getInt() -
+ cast<IntegerAttr>(values[0]).getInt();
+ }
+ if (rows > 1) {
+ rowStride = cast<IntegerAttr>(values[cols]).getInt() -
+ cast<IntegerAttr>(values[0]).getInt();
+ }
+
+ for (int64_t r = 0; r < rows; ++r) {
+ for (int64_t c = 0; c < cols; ++c) {
+ int64_t idx = r * cols + c;
+ // Check column stride
+ if (c > 0 && cols > 1) {
+ int64_t prevIdx = r * cols + (c - 1);
+ int64_t diff = cast<IntegerAttr>(values[idx]).getInt() -
+ cast<IntegerAttr>(values[prevIdx]).getInt();
+ if (diff != colStride)
+ return rewriter.notifyMatchFailure(
+ op, "Non-constant column stride in constant op.");
+ }
+ // Check row stride
+ if (r > 0 && rows > 1) {
+ int64_t prevIdx = (r - 1) * cols + c;
+ int64_t diff = cast<IntegerAttr>(values[idx]).getInt() -
+ cast<IntegerAttr>(values[prevIdx]).getInt();
+ if (diff != rowStride)
+ return rewriter.notifyMatchFailure(
+ op, "Non-constant row stride in constant op.");
+ }
+ }
+ }
+
+ // Create a constant for the base tile.
+ // For 2D case, extract the top-left sgShape[0] x sgShape[1] submatrix.
+ // For 1D case, extract the first sgShape[0] elements.
+ SmallVector<Attribute> baseTileValues;
+ int baseTileCols = sgShape[sgShape.size() - 1];
+ int64_t baseTileRows = sgShape.size() == 1 ? 1 : sgShape[0];
+ for (int64_t r = 0; r < baseTileRows; ++r) {
+ for (int64_t c = 0; c < baseTileCols; ++c) {
+ baseTileValues.push_back(values[r * cols + c]);
+ }
+ }
+
+ auto tileAttr = DenseElementsAttr::get(VectorType::get(sgShape, eltType),
+ baseTileValues);
+ auto baseConstVec = rewriter.create<arith::ConstantOp>(loc, tileAttr);
+
+ // Get subgroup id
+ Value sgId =
+ gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
+
+ auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape);
+ if (failed(sgOffsets))
+ return failure();
+
+ SmallVector<Value, 2> strideConsts;
+ strideConsts.push_back(
+ rewriter.create<arith::ConstantIndexOp>(loc, colStride));
+ if (rows > 1)
+ strideConsts.insert(
+ strideConsts.begin(),
+ rewriter.create<arith::ConstantIndexOp>(loc, rowStride));
+
+ SmallVector<Value> newConstOps;
+ for (auto offsets : *sgOffsets) {
+ // Multiply offset with stride, broadcast it and add to baseConstVec
+ Value mulOffset = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+ for (size_t i = 0; i < strideConsts.size(); ++i) {
+ Value mul = rewriter.create<arith::MulIOp>(
+ loc, rewriter.getIndexType(), offsets[i], strideConsts[i]);
+ mulOffset = rewriter.create<arith::AddIOp>(
+ loc, rewriter.getIndexType(), mulOffset, mul);
+ }
+ // Broadcast to baseConstVec size
+ auto bcastOffset = rewriter.create<vector::BroadcastOp>(
+ loc, baseConstVec.getType(), mulOffset);
+ auto finalConst =
+ arith::AddIOp::create(rewriter, loc, baseConstVec, bcastOffset);
+ setLayoutIfNeeded(baseConstVec);
+ setLayoutIfNeeded(bcastOffset);
+ setLayoutIfNeeded(finalConst);
+ newConstOps.push_back(finalConst);
+ }
+ rewriter.replaceOpWithMultiple(op, {newConstOps});
+ return success();
+ }
}
};
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index a88b59a..358bd33 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -951,6 +951,13 @@ llvm.func @rocdl.s.barrier() {
llvm.return
}
+llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<3>) {
+ // CHECK-LABEL: rocdl.s.barrier.init
+ // CHECK: rocdl.s.barrier.init %[[PTR:.+]], 1
+ rocdl.s.barrier.init %ptr, 1
+ llvm.return
+}
+
llvm.func @rocdl.s.barrier.signal() {
// CHECK-LABEL: rocdl.s.barrier.signal
// CHECK: rocdl.s.barrier.signal -1
@@ -958,6 +965,27 @@ llvm.func @rocdl.s.barrier.signal() {
llvm.return
}
+llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<3>) {
+ // CHECK-LABEL: rocdl.s.barrier.signal.var
+ // CHECK: rocdl.s.barrier.signal.var %[[PTR:.+]], 1
+ rocdl.s.barrier.signal.var %ptr, 1
+ llvm.return
+}
+
+llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<3>) {
+ // CHECK-LABEL: rocdl.s.barrier.join
+ // CHECK: rocdl.s.barrier.join %[[PTR:.+]]
+ rocdl.s.barrier.join %ptr
+ llvm.return
+}
+
+llvm.func @rocdl.s.barrier.leave() {
+ // CHECK-LABEL: rocdl.s.barrier.leave
+ // CHECK: rocdl.s.barrier.leave 1
+ rocdl.s.barrier.leave 1
+ llvm.return
+}
+
llvm.func @rocdl.s.barrier.wait() {
// CHECK-LABEL: rocdl.s.barrier.wait
// CHECK: rocdl.s.barrier.wait -1
@@ -965,6 +993,20 @@ llvm.func @rocdl.s.barrier.wait() {
llvm.return
}
+llvm.func @rocdl.s.barrier.signal.isfirst() {
+ // CHECK-LABEL: rocdl.s.barrier.signal.isfirst
+ // CHECK: rocdl.s.barrier.signal.isfirst 1
+ %0 = rocdl.s.barrier.signal.isfirst 1 : i1
+ llvm.return
+}
+
+llvm.func @rocdl.s.get.barrier.state() {
+ // CHECK-LABEL: rocdl.s.get.barrier.state
+ // CHECK: rocdl.s.get.barrier.state 1
+ %0 = rocdl.s.get.barrier.state 1 : i32
+ llvm.return
+}
+
llvm.func @rocdl.s.wait.dscnt() {
// CHECK-LABEL: rocdl.s.wait.dscnt
// CHECK: rocdl.s.wait.dscnt 0
diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
index 62bf1f5..11bea8d 100644
--- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir
@@ -918,12 +918,17 @@ func.func @mmt4d_scalable_with_assume(%A: memref<16x16x8x1xf32>, %B: memref<16x1
// CHECK-SAME: %[[B:.*]]: memref<16x16x?x1xf32>,
// CHECK-SAME: %[[C_IN:.*]]: memref<16x16x8x?xf32>) {
// CHECK-NOT: mask
-// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x[4]x1xf32>
-// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x?x1xf32>, vector<16x16x16x8x[4]x1xf32>
-// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32>
+// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]
+// CHECK-SAME: memref<16x16x8x1xf32>, vector<16x16x16x8x[4]x1xf32>
+// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]
+// `in-bounds` are set to true for dynamic dims with assume, static sizes will be inferred elsewhere.
+// CHECK-SAME: in_bounds = [false, false, false, false, true, false]{{.*}} : memref<16x16x?x1xf32>, vector<16x16x16x8x[4]x1xf32>
+// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C_IN]]
+// CHECK-SAME: in_bounds = [false, false, false, true]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32>
// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32>
// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32>
-// CHECK: vector.transfer_write %[[RED]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32>
+// CHECK: vector.transfer_write %[[RED]], %[[C_IN]]
+// CHECK-SAME: in_bounds = [false, false, false, true]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32>
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -1011,12 +1016,17 @@ func.func @batch_mmt4d_scalable_with_assume(%A: memref<2x16x16x8x1xf32>, %B: mem
// CHECK-SAME: %[[B:.*]]: memref<2x16x16x?x1xf32>,
// CHECK-SAME: %[[C_IN:.*]]: memref<2x16x16x8x?xf32>) {
// CHECK-NOT: mask
-// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<2x16x16x8x1xf32>, vector<2x16x16x16x8x[4]x1xf32>
-// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<2x16x16x?x1xf32>, vector<2x16x16x16x8x[4]x1xf32>
-// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C_IN]]{{.*}} : memref<2x16x16x8x?xf32>, vector<2x16x16x8x[4]xf32>
+// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]
+// CHECK-SAME: memref<2x16x16x8x1xf32>, vector<2x16x16x16x8x[4]x1xf32>
+// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]
+// `in-bounds` are set to true for dynamic dims with assume, static sizes will be inferred elsewhere.
+// CHECK-SAME: in_bounds = [false, false, false, false, false, true, false]{{.*}} : memref<2x16x16x?x1xf32>, vector<2x16x16x16x8x[4]x1xf32>
+// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C_IN]]
+// CHECK-SAME: in_bounds = [false, false, false, false, true]{{.*}} : memref<2x16x16x8x?xf32>, vector<2x16x16x8x[4]xf32>
// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<2x16x16x16x8x[4]x1xf32>
// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[MUL]], %[[VEC_C]] [3, 6] : vector<2x16x16x16x8x[4]x1xf32> to vector<2x16x16x8x[4]xf32>
-// CHECK: vector.transfer_write %[[RED]], %[[C_IN]]{{.*}} : vector<2x16x16x8x[4]xf32>, memref<2x16x16x8x?xf32>
+// CHECK: vector.transfer_write %[[RED]], %[[C_IN]]
+// CHECK-SAME: in_bounds = [false, false, false, false, true]{{.*}} : vector<2x16x16x8x[4]xf32>, memref<2x16x16x8x?xf32>
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
index dce73de..86a021b 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -98,4 +98,31 @@ gpu.module @test_distribution {
: vector<256x64xf32> to vector<256xf32>
gpu.return
}
+
+ gpu.func @non_splat_constant() {
+ // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}}> : vector<2x1xindex>
+ // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: %[[MAP4:.*]] = affine.apply #map4()[%[[SGID]]]
+ // CHECK-DAG: %[[MAP5:.*]] = affine.apply #map5()[%[[SGID]]]
+ // CHECK-DAG: %[[MUL:.*]] = index.mul %[[MAP4]], %[[C2:.*]]
+ // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[MUL]], %[[C32:.*]]
+ // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[MAP5]], %[[C1:.*]]
+ // CHECK-DAG: %[[ADD16:.*]] = arith.addi %[[MUL]], %[[C16:.*]] : index
+ // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[ADD16]], %[[C32:.*]]
+ // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[MAP5]], %[[C1:.*]]
+ // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU1]], %[[C16:.*]] : index
+ // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[STRIDE1]] : index
+ // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU2]], %[[C0:.*]] : index
+ // CHECK-DAG: %[[ADDSTRIDES1:.*]] = arith.addi %[[ADDSTRIDES]], %[[STRIDE2]] : index
+ // CHECK-DAG: %[[BCAST1:.*]] = vector.broadcast %[[ADDSTRIDES1]] : index to vector<2x1xindex>
+ // CHECK-DAG: %[[RESULT1:.*]] = arith.addi %[[BASECST]], %[[BCAST1]] : vector<2x1xindex>
+ // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index
+ // CHECK-DAG: %[[ADDSTRIDES2:.*]] = arith.addi %[[C0:.*]], %[[STRIDE3]] : index
+ // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index
+ // CHECK-DAG: %[[ADDSTRIDES3:.*]] = arith.addi %[[ADDSTRIDES2]], %[[STRIDE4]] : index
+ // CHECK-DAG: %[[BCAST2:.*]] = vector.broadcast %[[ADDSTRIDES3]] : index to vector<2x1xindex>
+ // CHECK-DAG: %[[RESULT2:.*]] = arith.addi %[[BASECST]], %[[BCAST2]] : vector<2x1xindex>
+ %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+ gpu.return
+ }
}
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index 38392fd..742d11f 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -463,4 +463,68 @@ gpu.module @test_distribution {
%broadcast = vector.broadcast %muli {layout_result_0 = #xegpu.layout<sg_layout = [4, 2, 6, 1], sg_data = [1, 1, 1, 32]>} : index to vector<4x2x6x32xindex>
gpu.return
}
+
+ // CHECK-LABEL: non_splat_constant_2D
+ gpu.func @non_splat_constant_2D() {
+ // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1xindex>
+ // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: affine.apply #map4()[%[[SGID]]]
+ // CHECK-DAG: affine.apply #map5()[%[[SGID]]]
+ // CHECK-DAG: %[[IDY:.*]] = index.remu %{{.*}}, %[[C32:.*]]
+ // CHECK-DAG: %[[IDX:.*]] = index.remu %{{.*}}, %[[C1:.*]]
+ // CHECK-DAG: %[[STRIDECOL:.*]] = arith.muli %[[IDY]], %[[C16:.*]] : index
+ // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[STRIDECOL]] : index
+ // CHECK-DAG: %[[STRIDEROW:.*]] = arith.muli %[[IDX]], %[[C0:.*]] : index
+ // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[STRIDEROW]] : index
+ // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1x1xindex>
+ // CHECK-DAG: arith.addi %[[CST]], %[[BCAST]] : vector<1x1xindex>
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex>
+ gpu.return
+ }
+
+ // CHECK-LABEL: non_splat_constant_2D_non_unit_dim
+ gpu.func @non_splat_constant_2D_non_unit_dim() {
+ // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}} : vector<2x2xindex>
+ // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: %[[IDY:.*]] = affine.apply #map()[%[[SGID]]]
+ // CHECK-DAG: %[[IDX:.*]] = affine.apply #map1()[%[[SGID]]]
+ // CHECK-DAG: %[[MULY:.*]] = index.mul %[[IDY]], %[[C2:.*]]
+ // CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index
+ // CHECK-DAG: %[[MULX:.*]] = index.mul %[[IDX]], %[[C2:.*]]
+ // CHECK-DAG: %[[REMU_Y:.*]] = index.remu %[[MULY]], %[[C8:.*]]
+ // CHECK-DAG: %[[C8_2:.*]] = arith.constant 8 : index
+ // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %[[C8:.*]]
+ // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %[[C8:.*]] : index
+ // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[MUL5]] : index
+ // CHECK-DAG: %[[MUL6:.*]] = arith.muli %[[REMU_X]], %[[C16:.*]] : index
+ // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index
+ // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex>
+ // CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex>
+ %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>} dense<[
+ [0, 16, 32, 48, 64, 80, 96, 112],
+ [8, 24, 40, 56, 72, 88, 104, 120],
+ [16, 32, 48, 64, 80, 96, 112, 128],
+ [24, 40, 56, 72, 88, 104, 120, 136],
+ [32, 48, 64, 80, 96, 112, 128, 144],
+ [40, 56, 72, 88, 104, 120, 136, 152],
+ [48, 64, 80, 96, 112, 128, 144, 160],
+ [56, 72, 88, 104, 120, 136, 152, 168]
+ ]> : vector<8x8xindex>
+ gpu.return
+ }
+
+ // CHECK-LABEL: non_splat_constant
+ gpu.func @non_splat_constant() {
+ // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
+ // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index
+ // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %[[C32:.*]]
+ // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] : index
+ // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index
+ // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex>
+ // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex>
+ %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [1]>} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex>
+ // CHECK: arith.constant dense<{{\[}}[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]{{\]}}> : vector<1x16xindex>
+ %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex>
+ gpu.return
+ }
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 1c0c2eb..fdd2c91 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -192,6 +192,13 @@ llvm.func @rocdl.barrier() {
llvm.return
}
+llvm.func @rocdl.s.barrier.init(%ptr : !llvm.ptr<3>) {
+ // CHECK-LABEL: rocdl.s.barrier.init
+ // CHECK: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %[[PTR:.+]], i32 1)
+ rocdl.s.barrier.init %ptr, 1
+ llvm.return
+}
+
llvm.func @rocdl.s.barrier.signal() {
// CHECK-LABEL: rocdl.s.barrier.signal
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
@@ -199,6 +206,27 @@ llvm.func @rocdl.s.barrier.signal() {
llvm.return
}
+llvm.func @rocdl.s.barrier.signal.var(%ptr : !llvm.ptr<3>) {
+ // CHECK-LABEL: rocdl.s.barrier.signal.var
+ // CHECK: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %[[PTR:.+]], i32 1)
+ rocdl.s.barrier.signal.var %ptr, 1
+ llvm.return
+}
+
+llvm.func @rocdl.s.barrier.join(%ptr : !llvm.ptr<3>) {
+ // CHECK-LABEL: rocdl.s.barrier.join
+ // CHECK: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %[[PTR:.+]])
+ rocdl.s.barrier.join %ptr
+ llvm.return
+}
+
+llvm.func @rocdl.s.barrier.leave() {
+ // CHECK-LABEL: rocdl.s.barrier.leave
+ // CHECK: call void @llvm.amdgcn.s.barrier.leave(i16 1)
+ rocdl.s.barrier.leave 1
+ llvm.return
+}
+
llvm.func @rocdl.s.barrier.wait() {
// CHECK-LABEL: rocdl.s.barrier.wait
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
@@ -206,6 +234,20 @@ llvm.func @rocdl.s.barrier.wait() {
llvm.return
}
+llvm.func @rocdl.s.barrier.signal.isfirst() {
+ // CHECK-LABEL: rocdl.s.barrier.signal.isfirst
+ // CHECK: %[[OUT:.+]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
+ %0 = rocdl.s.barrier.signal.isfirst 1 : i1
+ llvm.return
+}
+
+llvm.func @rocdl.s.get.barrier.state() {
+ // CHECK-LABEL: rocdl.s.get.barrier.state
+ // CHECK: %[[STATE:.+]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 1)
+ %0 = rocdl.s.get.barrier.state 1 : i32
+ llvm.return
+}
+
llvm.func @rocdl.s.wait.dscnt() {
// CHECK-LABEL: rocdl.s.wait.dscnt
// CHECK-NEXT: call void @llvm.amdgcn.s.wait.dscnt(i16 0)
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 6ea27187..6329d61 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -1169,6 +1169,11 @@ def OpP : TEST_Op<"op_p"> {
let results = (outs I32);
}
+def OpQ : TEST_Op<"op_q"> {
+ let arguments = (ins AnyType, AnyType);
+ let results = (outs AnyType);
+}
+
// Test constant-folding a pattern that maps `(F32) -> SI32`.
def SignOp : TEST_Op<"sign", [SameOperandsAndResultShape]> {
let arguments = (ins RankedTensorOf<[F32]>:$operand);
@@ -1207,6 +1212,14 @@ def TestNestedSameOpAndSameArgEqualityPattern :
def TestMultipleEqualArgsPattern :
Pat<(OpP $a, $b, $a, $a, $b, $c), (OpN $c, $b)>;
+// Test that equal-arguments checks are applied before user-provided constraints.
+def AssertBinOpEqualArgsAndReturnTrue : Constraint<
+ CPred<"assertBinOpEqualArgsAndReturnTrue($0)">>;
+def TestEqualArgsCheckBeforeUserConstraintsPattern :
+ Pat<(OpQ:$op $x, $x),
+ (replaceWithValue $x),
+ [(AssertBinOpEqualArgsAndReturnTrue $op)]>;
+
// Test for memrefs normalization of an op with normalizable memrefs.
def OpNorm : TEST_Op<"op_norm", [MemRefsNormalizable]> {
let arguments = (ins AnyMemRef:$X, AnyMemRef:$Y);
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index f8b5144..ee4fa39 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -70,6 +70,16 @@ static Attribute opMTest(PatternRewriter &rewriter, Value val) {
return rewriter.getIntegerAttr(rewriter.getIntegerType(32), i);
}
+static bool assertBinOpEqualArgsAndReturnTrue(Value v) {
+ Operation *operation = v.getDefiningOp();
+ if (operation->getOperand(0) != operation->getOperand(1)) {
+ // Name binding equality check must happen before user-defined constraints,
+ // thus this must not be triggered.
+ llvm::report_fatal_error("Arguments are not equal");
+ }
+ return true;
+}
+
namespace {
#include "TestPatterns.inc"
} // namespace
diff --git a/mlir/test/mlir-tblgen/pattern.mlir b/mlir/test/mlir-tblgen/pattern.mlir
index bd55338..ffb78c2 100644
--- a/mlir/test/mlir-tblgen/pattern.mlir
+++ b/mlir/test/mlir-tblgen/pattern.mlir
@@ -156,16 +156,19 @@ func.func @verifyNestedOpEqualArgs(
// def TestNestedOpEqualArgsPattern :
// Pat<(OpN $b, (OpP $a, $b, $c, $d, $e, $f)), (replaceWithValue $b)>;
- // CHECK: %arg1
+ // CHECK: "test.op_o"(%arg1)
%0 = "test.op_p"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5)
: (i32, i32, i32, i32, i32, i32) -> (i32)
%1 = "test.op_n"(%arg1, %0) : (i32, i32) -> (i32)
+ %2 = "test.op_o"(%1) : (i32) -> (i32)
- // CHECK: test.op_p
- // CHECK: test.op_n
- %2 = "test.op_p"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5)
+ // CHECK-NEXT: %[[P:.*]] = "test.op_p"
+ // CHECK-NEXT: %[[N:.*]] = "test.op_n"(%arg0, %[[P]])
+ // CHECK-NEXT: "test.op_o"(%[[N]])
+ %3 = "test.op_p"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5)
: (i32, i32, i32, i32, i32, i32) -> (i32)
- %3 = "test.op_n"(%arg0, %2) : (i32, i32) -> (i32)
+ %4 = "test.op_n"(%arg0, %3) : (i32, i32) -> (i32)
+ %5 = "test.op_o"(%4) : (i32) -> (i32)
return
}
@@ -206,6 +209,21 @@ func.func @verifyMultipleEqualArgs(
return
}
+func.func @verifyEqualArgsCheckBeforeUserConstraints(%arg0: i32, %arg1: f32) {
+ // def TestEqualArgsCheckBeforeUserConstraintsPattern :
+ // Pat<(OpQ:$op $x, $x),
+ // (replaceWithValue $x),
+ // [(AssertBinOpEqualArgsAndReturnTrue $op)]>;
+
+ // CHECK: "test.op_q"(%arg0, %arg1)
+ %0 = "test.op_q"(%arg0, %arg1) : (i32, f32) -> (i32)
+
+ // CHECK: "test.op_q"(%arg1, %arg0)
+ %1 = "test.op_q"(%arg1, %arg0) : (f32, i32) -> (i32)
+
+ return
+}
+
//===----------------------------------------------------------------------===//
// Test Symbol Binding
//===----------------------------------------------------------------------===//
diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp
index 605033d..40bc1a9 100644
--- a/mlir/tools/mlir-tblgen/RewriterGen.cpp
+++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp
@@ -1024,6 +1024,32 @@ void PatternEmitter::emitMatchLogic(DagNode tree, StringRef opName) {
int depth = 0;
emitMatch(tree, opName, depth);
+ // Some of the operands could be bound to the same symbol name; we need
+ // to enforce an equality constraint on those.
+ // This has to happen before user-provided constraints, which may assume the
+ // same-name checks have already been performed, since user-provided
+ // constraints appear later in the pattern source code.
+ // TODO: we should be able to emit equality checks early
+ // and short-circuit unnecessary work if vars are not equal.
+ for (auto symbolInfoIt = symbolInfoMap.begin();
+ symbolInfoIt != symbolInfoMap.end();) {
+ auto range = symbolInfoMap.getRangeOfEqualElements(symbolInfoIt->first);
+ auto startRange = range.first;
+ auto endRange = range.second;
+
+ auto firstOperand = symbolInfoIt->second.getVarName(symbolInfoIt->first);
+ for (++startRange; startRange != endRange; ++startRange) {
+ auto secondOperand = startRange->second.getVarName(symbolInfoIt->first);
+ emitMatchCheck(
+ opName,
+ formatv("*{0}.begin() == *{1}.begin()", firstOperand, secondOperand),
+ formatv("\"Operands '{0}' and '{1}' must be equal\"", firstOperand,
+ secondOperand));
+ }
+
+ symbolInfoIt = endRange;
+ }
+
for (auto &appliedConstraint : pattern.getConstraints()) {
auto &constraint = appliedConstraint.constraint;
auto &entities = appliedConstraint.entities;
@@ -1068,29 +1094,6 @@ void PatternEmitter::emitMatchLogic(DagNode tree, StringRef opName) {
}
}
- // Some of the operands could be bound to the same symbol name, we need
- // to enforce equality constraint on those.
- // TODO: we should be able to emit equality checks early
- // and short circuit unnecessary work if vars are not equal.
- for (auto symbolInfoIt = symbolInfoMap.begin();
- symbolInfoIt != symbolInfoMap.end();) {
- auto range = symbolInfoMap.getRangeOfEqualElements(symbolInfoIt->first);
- auto startRange = range.first;
- auto endRange = range.second;
-
- auto firstOperand = symbolInfoIt->second.getVarName(symbolInfoIt->first);
- for (++startRange; startRange != endRange; ++startRange) {
- auto secondOperand = startRange->second.getVarName(symbolInfoIt->first);
- emitMatchCheck(
- opName,
- formatv("*{0}.begin() == *{1}.begin()", firstOperand, secondOperand),
- formatv("\"Operands '{0}' and '{1}' must be equal\"", firstOperand,
- secondOperand));
- }
-
- symbolInfoIt = endRange;
- }
-
LLVM_DEBUG(llvm::dbgs() << "--- done emitting match logic ---\n");
}
diff --git a/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90 b/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90
new file mode 100644
index 0000000..45a18b7
--- /dev/null
+++ b/offload/test/offloading/fortran/descriptor-stack-jam-regression.f90
@@ -0,0 +1,101 @@
+! This test doesn't expect any results; the pass condition is running to completion
+! without any memory access errors on device or mapping issues from descriptor
+! collisions due to local descriptors being placed on device and not being unmapped
+! before a subsequent local descriptor residing at the same address is mapped to
+! device.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+module test
+contains
+ subroutine kernel_1d(array)
+ implicit none
+ real, dimension(:) :: array
+ integer :: i
+
+ !$omp target enter data map(alloc:array)
+ !$omp target teams distribute parallel do
+ do i=1, ubound(array, 1)
+ array(i) = 42.0
+ end do
+ !$omp target update from(array)
+ end subroutine
+
+ subroutine kernel_2d(array)
+ implicit none
+ real, dimension(:,:) :: array
+ integer :: i, j
+
+ !$omp target enter data map(alloc:array)
+ !$omp target teams distribute parallel do collapse(2)
+ do j=1, ubound(array, 2)
+ do i=1, ubound(array, 1)
+ array(i,j) = 42.0
+ end do
+ end do
+ !$omp target update from(array)
+ end subroutine
+
+ subroutine kernel_3d(array)
+ implicit none
+ real, dimension(:,:,:) :: array
+ integer :: i, j, k
+
+ !$omp target enter data map(alloc:array)
+ !$omp target teams distribute parallel do collapse(3)
+ do k=1, ubound(array, 3)
+ do j=1, ubound(array, 2)
+ do i=1, ubound(array, 1)
+ array(i,j,k) = 42.0
+ end do
+ end do
+ end do
+ !$omp target update from(array)
+ end subroutine
+
+ subroutine kernel_4d(array)
+ implicit none
+ real, dimension(:,:,:,:) :: array
+ integer :: i, j, k, l
+
+ !$omp target enter data map(alloc:array)
+ !$omp target teams distribute parallel do collapse(4)
+ do l=1, ubound(array, 4)
+ do k=1, ubound(array, 3)
+ do j=1, ubound(array, 2)
+ do i=1, ubound(array, 1)
+ array(i,j,k,l) = 42.0
+ end do
+ end do
+ end do
+ enddo
+ !$omp target update from(array)
+ end subroutine
+end module
+
+program main
+ use test
+ implicit none
+ integer, parameter :: n = 2
+ real :: array1(n)
+ real :: array2(n,n)
+ real :: array3(n,n,n)
+ real :: array4(n,n,n,n)
+
+ call kernel_1d(array1)
+ call kernel_2d(array2)
+ call kernel_3d(array3)
+ call kernel_4d(array4)
+
+ print *, array1
+ print *, array2
+ print *, array3
+ print *, array4
+ print *, "PASS"
+end program
+
+! CHECK: 42. 42.
+! CHECK: 42. 42. 42. 42.
+! CHECK: 42. 42. 42. 42. 42. 42. 42. 42.
+! CHECK: 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42. 42.
+! CHECK: PASS
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index ea269da..640fa03 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1496,6 +1496,7 @@ libc_support_library(
deps = [
":__support_common",
":__support_cpp_bit",
+ ":__support_macros_optimization",
":__support_macros_sanitizer",
],
)