-rw-r--r-- .github/workflows/release-asset-audit.py | 2
-rw-r--r-- bolt/test/perf2bolt/AArch64/perf2bolt-spe.test | 3
-rw-r--r-- clang/docs/AllocToken.rst | 173
-rw-r--r-- clang/docs/ReleaseNotes.rst | 6
-rw-r--r-- clang/docs/UsersManual.rst | 18
-rw-r--r-- clang/docs/index.rst | 1
-rw-r--r-- clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h | 7
-rw-r--r-- clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def | 1
-rw-r--r-- clang/include/clang/Basic/BuiltinsX86.td | 19
-rw-r--r-- clang/include/clang/Basic/CodeGenOptions.def | 2
-rw-r--r-- clang/include/clang/Basic/CodeGenOptions.h | 4
-rw-r--r-- clang/include/clang/Basic/DiagnosticGroups.td | 3
-rw-r--r-- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2
-rw-r--r-- clang/include/clang/Basic/Sanitizers.def | 3
-rw-r--r-- clang/include/clang/CIR/Dialect/IR/CIRAttrs.td | 45
-rw-r--r-- clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h | 13
-rw-r--r-- clang/include/clang/CIR/Dialect/IR/CIRDialect.td | 1
-rw-r--r-- clang/include/clang/CIR/MissingFeatures.h | 1
-rw-r--r-- clang/include/clang/Driver/Options.td | 21
-rw-r--r-- clang/include/clang/Driver/SanitizerArgs.h | 2
-rw-r--r-- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 119
-rw-r--r-- clang/lib/AST/ExprConstant.cpp | 48
-rw-r--r-- clang/lib/Analysis/UnsafeBufferUsage.cpp | 99
-rw-r--r-- clang/lib/Basic/Diagnostic.cpp | 51
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenClass.cpp | 103
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 22
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 14
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenFunction.h | 27
-rw-r--r-- clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp | 5
-rw-r--r-- clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp | 38
-rw-r--r-- clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 75
-rw-r--r-- clang/lib/CodeGen/BackendUtil.cpp | 20
-rw-r--r-- clang/lib/CodeGen/CGDebugInfo.cpp | 12
-rw-r--r-- clang/lib/CodeGen/CGExpr.cpp | 17
-rw-r--r-- clang/lib/CodeGen/CGExprCXX.cpp | 15
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.cpp | 2
-rw-r--r-- clang/lib/CodeGen/CodeGenFunction.h | 3
-rw-r--r-- clang/lib/Driver/SanitizerArgs.cpp | 31
-rw-r--r-- clang/lib/Driver/ToolChain.cpp | 3
-rw-r--r-- clang/lib/Driver/ToolChains/Clang.cpp | 2
-rw-r--r-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 5
-rw-r--r-- clang/lib/Driver/ToolChains/UEFI.cpp | 4
-rw-r--r-- clang/lib/Frontend/CompilerInvocation.cpp | 13
-rw-r--r-- clang/lib/Frontend/InitPreprocessor.cpp | 2
-rw-r--r-- clang/lib/Headers/avx2intrin.h | 12
-rw-r--r-- clang/lib/Headers/avx512bwintrin.h | 12
-rw-r--r-- clang/lib/Headers/avx512vlbwintrin.h | 16
-rw-r--r-- clang/lib/Headers/emmintrin.h | 4
-rw-r--r-- clang/lib/Headers/mmintrin.h | 9
-rw-r--r-- clang/lib/Headers/tmmintrin.h | 20
-rw-r--r-- clang/lib/Sema/AnalysisBasedWarnings.cpp | 11
-rw-r--r-- clang/test/CIR/CodeGen/cast.cpp | 33
-rw-r--r-- clang/test/CIR/CodeGen/dtors.cpp | 37
-rw-r--r-- clang/test/CIR/CodeGen/global-init.cpp | 29
-rw-r--r-- clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp | 4
-rw-r--r-- clang/test/CodeGen/X86/avx2-builtins.c | 2
-rw-r--r-- clang/test/CodeGen/X86/avx512bw-builtins.c | 11
-rw-r--r-- clang/test/CodeGen/X86/avx512vlbw-builtins.c | 8
-rw-r--r-- clang/test/CodeGen/X86/mmx-builtins.c | 2
-rw-r--r-- clang/test/CodeGen/X86/sse2-builtins.c | 1
-rw-r--r-- clang/test/CodeGen/X86/ssse3-builtins.c | 1
-rw-r--r-- clang/test/CodeGen/alloc-token-ignorelist.c | 27
-rw-r--r-- clang/test/CodeGen/alloc-token-lower.c | 22
-rw-r--r-- clang/test/CodeGen/alloc-token.c | 37
-rw-r--r-- clang/test/CodeGen/dwarf-version.c | 2
-rw-r--r-- clang/test/CodeGenCXX/alloc-token.cpp | 141
-rw-r--r-- clang/test/Driver/fsanitize-alloc-token.c | 43
-rw-r--r-- clang/test/Driver/fuse-ld.c | 5
-rw-r--r-- clang/test/Driver/uefi-constructed-args.c | 5
-rw-r--r-- clang/test/Preprocessor/alloc_token.cpp | 10
-rw-r--r-- clang/test/Preprocessor/print-header-json.c | 1
-rw-r--r-- clang/test/SemaCXX/warn-unsafe-buffer-usage-unique-ptr.cpp | 43
-rw-r--r-- clang/www/c_status.html | 51
-rw-r--r-- compiler-rt/lib/builtins/cpu_model/aarch64.c | 11
-rw-r--r-- compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc | 5
-rw-r--r-- compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc | 1
-rw-r--r-- compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc | 2
-rw-r--r-- flang/test/Examples/omp-atomic.f90 | 2
-rw-r--r-- flang/test/Examples/omp-declarative-directive.f90 | 2
-rw-r--r-- flang/test/Examples/omp-device-constructs.f90 | 2
-rw-r--r-- flang/test/Examples/omp-in-reduction-clause.f90 | 2
-rw-r--r-- flang/test/Examples/omp-nowait.f90 | 2
-rw-r--r-- flang/test/Examples/omp-order-clause.f90 | 2
-rw-r--r-- flang/test/Examples/omp-sections.f90 | 2
-rw-r--r-- libc/config/linux/app.h | 14
-rw-r--r-- libc/include/llvm-libc-types/CMakeLists.txt | 2
-rw-r--r-- libc/include/llvm-libc-types/in_addr.h | 18
-rw-r--r-- libc/include/llvm-libc-types/in_addr_t.h | 14
-rw-r--r-- libc/src/__support/File/linux/lseekImpl.h | 9
-rw-r--r-- libc/src/__support/OSUtil/linux/CMakeLists.txt | 12
-rw-r--r-- libc/src/__support/OSUtil/linux/auxv.h | 158
-rw-r--r-- libc/src/__support/threads/callonce.h | 10
-rw-r--r-- libc/src/__support/threads/linux/CMakeLists.txt | 4
-rw-r--r-- libc/src/__support/threads/linux/callonce.cpp | 40
-rw-r--r-- libc/src/__support/threads/linux/callonce.h | 25
-rw-r--r-- libc/src/pthread/pthread_once.cpp | 3
-rw-r--r-- libc/src/sys/auxv/linux/CMakeLists.txt | 13
-rw-r--r-- libc/src/sys/auxv/linux/getauxval.cpp | 223
-rw-r--r-- libc/src/threads/call_once.cpp | 3
-rw-r--r-- libc/startup/gpu/amdgpu/start.cpp | 10
-rw-r--r-- libc/startup/gpu/nvptx/start.cpp | 10
-rw-r--r-- libc/startup/linux/CMakeLists.txt | 1
-rw-r--r-- libc/startup/linux/do_start.cpp | 15
-rw-r--r-- lldb/source/Core/IOHandler.cpp | 11
-rw-r--r-- lldb/test/API/driver/quit_speed/TestQuitWithProcess.py | 25
-rw-r--r-- lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py | 18
-rw-r--r-- lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml | 29
-rw-r--r-- lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json | 6
-rw-r--r-- llvm/docs/CFIVerify.rst | 20
-rw-r--r-- llvm/include/llvm/ADT/Bitset.h | 18
-rw-r--r-- llvm/include/llvm/Analysis/ScalarEvolution.h | 16
-rw-r--r-- llvm/include/llvm/BinaryFormat/Dwarf.h | 9
-rw-r--r-- llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h | 153
-rw-r--r-- llvm/include/llvm/IR/DIBuilder.h | 6
-rw-r--r-- llvm/include/llvm/IR/DebugInfoMetadata.h | 76
-rw-r--r-- llvm/include/llvm/Support/SpecialCaseList.h | 1
-rw-r--r-- llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h | 5
-rw-r--r-- llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp | 6
-rw-r--r-- llvm/lib/Analysis/ScalarEvolution.cpp | 56
-rw-r--r-- llvm/lib/AsmParser/LLParser.cpp | 10
-rw-r--r-- llvm/lib/BinaryFormat/Dwarf.cpp | 131
-rw-r--r-- llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 10
-rw-r--r-- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 4
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 12
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 16
-rw-r--r-- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 7
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 3
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2
-rw-r--r-- llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp | 268
-rw-r--r-- llvm/lib/IR/AsmWriter.cpp | 8
-rw-r--r-- llvm/lib/IR/DIBuilder.cpp | 8
-rw-r--r-- llvm/lib/IR/DebugInfo.cpp | 2
-rw-r--r-- llvm/lib/IR/DebugInfoMetadata.cpp | 9
-rw-r--r-- llvm/lib/Passes/PassBuilder.cpp | 2
-rw-r--r-- llvm/lib/Passes/PassBuilderPipelines.cpp | 4
-rw-r--r-- llvm/lib/Passes/PassRegistry.def | 5
-rw-r--r-- llvm/lib/Support/SpecialCaseList.cpp | 24
-rw-r--r-- llvm/lib/Target/AArch64/AArch64SystemOperands.td | 223
-rw-r--r-- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 22
-rw-r--r-- llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp | 7
-rw-r--r-- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 7
-rw-r--r-- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 8
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 103
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp | 9
-rw-r--r-- llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 2
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 24
-rw-r--r-- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 2
-rw-r--r-- llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp | 3
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp | 4
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 43
-rw-r--r-- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 12
-rw-r--r-- llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 60
-rw-r--r-- llvm/lib/Transforms/Scalar/GVN.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Scalar/NewGVN.cpp | 1
-rw-r--r-- llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 2
-rw-r--r-- llvm/lib/Transforms/Utils/Debugify.cpp | 5
-rw-r--r-- llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 939
-rw-r--r-- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 31
-rw-r--r-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2
-rw-r--r-- llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll | 40
-rw-r--r-- llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll (renamed from llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll) | 374
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mad_int24.ll | 313
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mad_uint24.ll | 1492
-rw-r--r-- llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll | 48
-rw-r--r-- llvm/test/CodeGen/WebAssembly/saturating-truncation.ll | 87
-rw-r--r-- llvm/test/CodeGen/X86/ptrtoaddr-fast-isel.ll | 11
-rw-r--r-- llvm/test/DebugInfo/X86/instr-ref-opt-bisect2.ll | 36
-rw-r--r-- llvm/test/MC/AArch64/armv9a-sysp-diagnostics.s | 95
-rw-r--r-- llvm/test/Other/new-pm-print-pipeline.ll | 2
-rw-r--r-- llvm/test/Transforms/GVN/ptrtoaddr.ll | 30
-rw-r--r-- llvm/test/Transforms/InstCombine/fold-selective-shift.ll | 323
-rw-r--r-- llvm/test/Transforms/LoopRotate/multiple-deopt-exits.ll | 164
-rw-r--r-- llvm/test/Transforms/LoopRotate/multiple-exits.ll | 236
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll | 195
-rw-r--r-- llvm/test/Transforms/NewGVN/ptrtoaddr.ll | 29
-rw-r--r-- llvm/test/Transforms/PhaseOrdering/switch-to-arithmetic-inlining.ll | 3
-rw-r--r-- llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll | 130
-rw-r--r-- llvm/tools/llvm-gpu-loader/amdhsa.cpp | 4
-rw-r--r-- llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h | 4
-rw-r--r-- llvm/tools/llvm-gpu-loader/nvptx.cpp | 4
-rw-r--r-- llvm/unittests/BinaryFormat/DwarfTest.cpp | 73
-rw-r--r-- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 4
-rw-r--r-- llvm/unittests/CodeGen/LexicalScopesTest.cpp | 4
-rw-r--r-- llvm/unittests/CodeGen/MachineBasicBlockTest.cpp | 4
-rw-r--r-- llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp | 5
-rw-r--r-- llvm/unittests/IR/DebugInfoTest.cpp | 11
-rw-r--r-- llvm/unittests/IR/IRBuilderTest.cpp | 35
-rw-r--r-- llvm/unittests/IR/MetadataTest.cpp | 25
-rw-r--r-- llvm/unittests/IR/VerifierTest.cpp | 8
-rw-r--r-- llvm/unittests/Support/SpecialCaseListTest.cpp | 62
-rw-r--r-- llvm/unittests/Transforms/Utils/CloningTest.cpp | 24
-rw-r--r-- mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 103
-rw-r--r-- mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td | 6
-rw-r--r-- mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td | 80
-rw-r--r-- mlir/include/mlir/Dialect/Vector/IR/VectorOps.td | 1
-rw-r--r-- mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp | 177
-rw-r--r-- mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp | 85
-rw-r--r-- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 74
-rw-r--r-- mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp | 64
-rw-r--r-- mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp | 15
-rw-r--r-- mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp | 218
-rw-r--r-- mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp | 1
-rw-r--r-- mlir/lib/Dialect/NVGPU/Utils/MMAUtils.cpp | 7
-rw-r--r-- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 111
-rw-r--r-- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 120
-rw-r--r-- mlir/lib/Target/Cpp/TranslateToCpp.cpp | 47
-rw-r--r-- mlir/lib/Target/LLVMIR/DebugImporter.cpp | 3
-rw-r--r-- mlir/test/Conversion/SCFToEmitC/while.mlir | 293
-rw-r--r-- mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir | 84
-rw-r--r-- mlir/test/Dialect/EmitC/invalid_ops.mlir | 149
-rw-r--r-- mlir/test/Dialect/EmitC/ops.mlir | 20
-rw-r--r-- mlir/test/Dialect/Linalg/decompose-pack.mlir | 21
-rw-r--r-- mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir | 24
-rw-r--r-- mlir/test/Dialect/OpenACC/pointer-like-interface-copy.mlir | 23
-rw-r--r-- mlir/test/Dialect/OpenACC/pointer-like-interface-free.mlir | 31
-rw-r--r-- mlir/test/Dialect/Vector/canonicalize.mlir | 40
-rw-r--r-- mlir/test/Target/Cpp/do.mlir | 168
-rw-r--r-- mlir/test/lib/Dialect/CMakeLists.txt | 1
-rw-r--r-- mlir/test/lib/Dialect/OpenACC/CMakeLists.txt | 16
-rw-r--r-- mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp | 23
-rw-r--r-- mlir/test/lib/Dialect/OpenACC/TestPointerLikeTypeInterface.cpp | 305
-rw-r--r-- mlir/tools/mlir-opt/CMakeLists.txt | 1
-rw-r--r-- mlir/tools/mlir-opt/mlir-opt.cpp | 2
-rw-r--r-- openmp/runtime/README.txt | 1
225 files changed, 8377 insertions(+), 2578 deletions(-)
diff --git a/.github/workflows/release-asset-audit.py b/.github/workflows/release-asset-audit.py
index 23b901a..0cff6c6 100644
--- a/.github/workflows/release-asset-audit.py
+++ b/.github/workflows/release-asset-audit.py
@@ -54,6 +54,8 @@ def _get_uploaders(release_version):
"tru",
"tstellar",
"github-actions[bot]",
+ "c-rhodes",
+ "dyung",
]
)
diff --git a/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
index 91f5c85..1f44f75 100644
--- a/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
+++ b/bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
@@ -6,7 +6,6 @@ RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.e
RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe 2> /dev/null
-RUN: (perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2> /dev/null; exit 0) | FileCheck %s --check-prefix=CHECK-SPE-LBR
+RUN: perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe | FileCheck %s --check-prefix=CHECK-SPE-LBR
CHECK-SPE-LBR: PERF2BOLT: parse SPE branch events in LBR-format
-
diff --git a/clang/docs/AllocToken.rst b/clang/docs/AllocToken.rst
new file mode 100644
index 0000000..fb5c060
--- /dev/null
+++ b/clang/docs/AllocToken.rst
@@ -0,0 +1,173 @@
+=================
+Allocation Tokens
+=================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+Clang provides support for allocation tokens to enable allocator-level heap
+organization strategies. Clang assigns mode-dependent token IDs to allocation
+calls; the runtime behavior depends entirely on the implementation of a
+compatible memory allocator.
+
+Possible allocator strategies include:
+
+* **Security Hardening**: Placing allocations into separate, isolated heap
+ partitions. For example, separating pointer-containing types from raw data
+ can mitigate exploits that rely on overflowing a primitive buffer to corrupt
+ object metadata.
+
+* **Memory Layout Optimization**: Grouping related allocations to improve data
+ locality and cache utilization.
+
+* **Custom Allocation Policies**: Applying different management strategies to
+ different partitions.
+
+Token Assignment Mode
+=====================
+
+The default token ID assignment mode is:
+
+* ``typehash``: This mode assigns a token ID based on the hash of the allocated
+  type's name; a conceptual sketch is shown below.
+
+Other token ID assignment modes are supported, but they may be subject to
+change or removal. These may (experimentally) be selected with ``-mllvm
+-alloc-token-mode=<mode>``:
+
+* ``random``: This mode assigns a statically-determined random token ID to each
+ allocation site.
+
+* ``increment``: This mode assigns a simple, incrementally increasing token ID
+ to each allocation site.
+
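+For illustration, a ``typehash``-style mapping conceptually reduces to hashing
+the type name and bounding the result. The sketch below uses FNV-1a as a
+stand-in hash; it is illustrative only and not the hash Clang actually
+implements:
+
+.. code-block:: c
+
+  #include <stdint.h>
+
+  // Conceptual sketch only: derive a token ID from a type name. The real
+  // implementation's hash function and inputs are compiler internals.
+  uint64_t example_token_id(const char *type_name, uint64_t max_tokens) {
+    uint64_t h = 14695981039346656037ull; // FNV-1a offset basis
+    for (const char *p = type_name; *p; ++p)
+      h = (h ^ (uint64_t)(unsigned char)*p) * 1099511628211ull;
+    return max_tokens ? h % max_tokens : h;
+  }
+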
+Allocation Token Instrumentation
+================================
+
+To enable instrumentation of allocation functions, code can be compiled with
+the ``-fsanitize=alloc-token`` flag:
+
+.. code-block:: console
+
+ % clang++ -fsanitize=alloc-token example.cc
+
+The instrumentation transforms allocation calls to include a token ID. For
+example:
+
+.. code-block:: c
+
+ // Original:
+ ptr = malloc(size);
+
+ // Instrumented:
+ ptr = __alloc_token_malloc(size, <token id>);
+
+The following command-line options affect generated token IDs:
+
+* ``-falloc-token-max=<N>``
+ Configures the maximum number of tokens. No max by default (tokens bounded
+ by ``SIZE_MAX``).
+
+ .. code-block:: console
+
+ % clang++ -fsanitize=alloc-token -falloc-token-max=512 example.cc
+
+Runtime Interface
+-----------------
+
+A compatible runtime must be provided that implements the token-enabled
+allocation functions. The instrumentation generates calls to functions that
+take a final ``size_t token_id`` argument.
+
+.. code-block:: c
+
+ // C standard library functions
+ void *__alloc_token_malloc(size_t size, size_t token_id);
+ void *__alloc_token_calloc(size_t count, size_t size, size_t token_id);
+ void *__alloc_token_realloc(void *ptr, size_t size, size_t token_id);
+ // ...
+
+ // C++ operators (mangled names)
+ // operator new(size_t, size_t)
+ void *__alloc_token__Znwm(size_t size, size_t token_id);
+ // operator new[](size_t, size_t)
+ void *__alloc_token__Znam(size_t size, size_t token_id);
+ // ... other variants like nothrow, etc., are also instrumented.
+
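+For instance, a minimal compatible runtime might route each token to one of a
+fixed set of heap partitions. The sketch below is illustrative only and
+simulates partitions with plain ``malloc``; a real allocator would maintain
+isolated heap regions:
+
+.. code-block:: c
+
+  #include <stdlib.h>
+
+  #define NUM_PARTITIONS 8 // hypothetical allocator-side partition count
+
+  void *__alloc_token_malloc(size_t size, size_t token_id) {
+    size_t partition = token_id % NUM_PARTITIONS;
+    (void)partition; // a real allocator would select a heap region here
+    return malloc(size);
+  }
+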
+Fast ABI
+--------
+
+An alternative ABI can be enabled with ``-fsanitize-alloc-token-fast-abi``,
+which encodes the token ID hint in the allocation function name.
+
+.. code-block:: c
+
+ void *__alloc_token_0_malloc(size_t size);
+ void *__alloc_token_1_malloc(size_t size);
+ void *__alloc_token_2_malloc(size_t size);
+ ...
+ void *__alloc_token_0_Znwm(size_t size);
+ void *__alloc_token_1_Znwm(size_t size);
+ void *__alloc_token_2_Znwm(size_t size);
+ ...
+
+This ABI provides a more efficient alternative when
+``-falloc-token-max`` is small.
+
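+For example, a call site assigned token ID 1 would be lowered along these
+lines (illustrative):
+
+.. code-block:: c
+
+  // Original:
+  ptr = malloc(size);
+
+  // Instrumented with the fast ABI (token ID 1):
+  ptr = __alloc_token_1_malloc(size);
+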
+Disabling Instrumentation
+-------------------------
+
+To exclude specific functions from instrumentation, you can use the
+``no_sanitize("alloc-token")`` attribute:
+
+.. code-block:: c
+
+ __attribute__((no_sanitize("alloc-token")))
+ void* custom_allocator(size_t size) {
+ return malloc(size); // Uses original malloc
+ }
+
+Note: Independent of any given allocator support, the instrumentation aims to
+remain performance neutral. As such, ``no_sanitize("alloc-token")``
+functions may be inlined into instrumented functions and vice-versa. If
+correctness is affected, such functions should explicitly be marked
+``noinline``.
+
+The ``__attribute__((disable_sanitizer_instrumentation))`` attribute is also
+supported; it disables this along with other sanitizer instrumentation.
+
+Suppressions File (Ignorelist)
+------------------------------
+
+AllocToken respects the ``src`` and ``fun`` entity types in the
+:doc:`SanitizerSpecialCaseList`, which can be used to omit specified source
+files or functions from instrumentation.
+
+.. code-block:: bash
+
+ [alloc-token]
+ # Exclude specific source files
+ src:third_party/allocator.c
+ # Exclude function name patterns
+ fun:*custom_malloc*
+ fun:LowLevel::*
+
+.. code-block:: console
+
+ % clang++ -fsanitize=alloc-token -fsanitize-ignorelist=my_ignorelist.txt example.cc
+
+Conditional Compilation with ``__SANITIZE_ALLOC_TOKEN__``
+-----------------------------------------------------------
+
+In some cases, one may need to execute different code depending on whether
+AllocToken instrumentation is enabled. The ``__SANITIZE_ALLOC_TOKEN__`` macro
+can be used for this purpose.
+
+.. code-block:: c
+
+ #ifdef __SANITIZE_ALLOC_TOKEN__
+ // Code specific to -fsanitize=alloc-token builds
+ #endif
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 5e9a71e..9a0d69c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -257,10 +257,16 @@ Non-comprehensive list of changes in this release
- Fixed a crash when the second argument to ``__builtin_assume_aligned`` was not constant (#GH161314)
+- Introduce support for :doc:`allocation tokens <AllocToken>` to enable
+ allocator-level heap organization strategies. A feature to instrument all
+ allocation functions with a token ID can be enabled via the
+ ``-fsanitize=alloc-token`` flag.
+
New Compiler Flags
------------------
- New option ``-fno-sanitize-debug-trap-reasons`` added to disable emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``).
- New option ``-fsanitize-debug-trap-reasons=`` added to control emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``).
+- New options for enabling allocation token instrumentation: ``-fsanitize=alloc-token``, ``-falloc-token-max=``, ``-fsanitize-alloc-token-fast-abi``, ``-fsanitize-alloc-token-extended``.
Lanai Support
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index a8bbf14..12c2ada 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -2155,13 +2155,11 @@ are listed below.
.. option:: -f[no-]sanitize=check1,check2,...
- Turn on runtime checks for various forms of undefined or suspicious
- behavior.
+ Turn on runtime checks or mitigations for various forms of undefined or
+ suspicious behavior. These are disabled by default.
- This option controls whether Clang adds runtime checks for various
- forms of undefined or suspicious behavior, and is disabled by
- default. If a check fails, a diagnostic message is produced at
- runtime explaining the problem. The main checks are:
+ The following options enable runtime checks for various forms of undefined
+ or suspicious behavior:
- .. _opt_fsanitize_address:
@@ -2195,6 +2193,14 @@ are listed below.
- ``-fsanitize=realtime``: :doc:`RealtimeSanitizer`,
a real-time safety checker.
+ The following options enable runtime mitigations for various forms of
+ undefined or suspicious behavior:
+
+ - ``-fsanitize=alloc-token``: Enables :doc:`allocation tokens <AllocToken>`
+ for allocator-level heap organization strategies, such as for security
+ hardening. It passes type-derived token IDs to a compatible memory
+ allocator. Requires linking against a token-aware allocator.
+
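+ For example (linking against a token-aware allocator runtime is assumed):
+
+ .. code-block:: console
+
+ % clang++ -fsanitize=alloc-token example.cc
+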
There are more fine-grained checks available: see
the :ref:`list <ubsan-checks>` of specific kinds of
undefined behavior that can be detected and the :ref:`list <cfi-schemes>`
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index e238518..272ae54 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -40,6 +40,7 @@ Using Clang as a Compiler
SanitizerCoverage
SanitizerStats
SanitizerSpecialCaseList
+ AllocToken
BoundsSafety
BoundsSafetyAdoptionGuide
BoundsSafetyImplPlans
diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h
index 9b53f1d..ea41eb3 100644
--- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h
+++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsage.h
@@ -14,6 +14,7 @@
#ifndef LLVM_CLANG_ANALYSIS_ANALYSES_UNSAFEBUFFERUSAGE_H
#define LLVM_CLANG_ANALYSIS_ANALYSES_UNSAFEBUFFERUSAGE_H
+#include "clang/AST/ASTTypeTraits.h"
#include "clang/AST/Decl.h"
#include "clang/AST/Expr.h"
#include "clang/AST/Stmt.h"
@@ -139,6 +140,12 @@ public:
FixItList &&Fixes, const Decl *D,
const FixitStrategy &VarTargetTypes) = 0;
+ // Invoked when an array subscript operator[] is used on a
+ // std::unique_ptr<T[]>.
+ virtual void handleUnsafeUniquePtrArrayAccess(const DynTypedNode &Node,
+ bool IsRelatedToDecl,
+ ASTContext &Ctx) = 0;
+
#ifndef NDEBUG
public:
bool areDebugNotesRequested() {
diff --git a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def
index 09fa510..fae5f8b 100644
--- a/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def
+++ b/clang/include/clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def
@@ -38,6 +38,7 @@ WARNING_GADGET(PointerArithmetic)
WARNING_GADGET(UnsafeBufferUsageAttr)
WARNING_GADGET(UnsafeBufferUsageCtorAttr)
WARNING_GADGET(DataInvocation)
+WARNING_GADGET(UniquePtrArrayAccess)
WARNING_OPTIONAL_GADGET(UnsafeLibcFunctionCall)
WARNING_OPTIONAL_GADGET(SpanTwoParamConstructor) // Uses of `std::span(arg0, arg1)`
FIXABLE_GADGET(ULCArraySubscript) // `DRE[any]` in an Unspecified Lvalue Context
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 4165225..217589d 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -123,13 +123,16 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
- def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
+
+ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+ def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
+ }
}
// AVX
@@ -278,13 +281,14 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
- def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
}
let Features = "sse2",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
+ def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
+
def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">;
def psllwi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">;
@@ -581,8 +585,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
- def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
- def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
@@ -619,6 +621,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
+ def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
+ def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
+
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
@@ -1378,10 +1383,6 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
def subps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
}
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
- def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
-}
let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def addss_round_mask : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>, unsigned char, _Constant int)">;
@@ -1999,6 +2000,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512
}
let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+ def pmaddubsw512 : X86Builtin<"_Vector<32, short>(_Vector<64, char>, _Vector<64, char>)">;
+ def pmaddwd512 : X86Builtin<"_Vector<16, int>(_Vector<32, short>, _Vector<32, short>)">;
def psllv32hi : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">;
def pshufhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
def pshuflw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Constant int)">;
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 872f73e..d924cb4 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -306,6 +306,8 @@ CODEGENOPT(SanitizeBinaryMetadataCovered, 1, 0, Benign) ///< Emit PCs for covere
CODEGENOPT(SanitizeBinaryMetadataAtomics, 1, 0, Benign) ///< Emit PCs for atomic operations.
CODEGENOPT(SanitizeBinaryMetadataUAR, 1, 0, Benign) ///< Emit PCs for start of functions
///< that are subject for use-after-return checking.
+CODEGENOPT(SanitizeAllocTokenFastABI, 1, 0, Benign) ///< Use the AllocToken fast ABI.
+CODEGENOPT(SanitizeAllocTokenExtended, 1, 0, Benign) ///< Extend coverage to custom allocation functions.
CODEGENOPT(SanitizeStats , 1, 0, Benign) ///< Collect statistics for sanitizers.
ENUM_CODEGENOPT(SanitizeDebugTrapReasons, SanitizeDebugTrapReasonKind, 2, SanitizeDebugTrapReasonKind::Detailed, Benign) ///< Control how "trap reasons" are emitted in debug info
CODEGENOPT(SimplifyLibCalls , 1, 1, Benign) ///< Set when -fbuiltin is enabled.
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index 5d5cf25..cae06c3 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -447,6 +447,10 @@ public:
std::optional<double> AllowRuntimeCheckSkipHotCutoff;
+ /// Maximum number of allocation tokens (0 = no max), nullopt if none set (use
+ /// pass default).
+ std::optional<uint64_t> AllocTokenMax;
+
/// List of backend command-line options for -fembed-bitcode.
std::vector<uint8_t> CmdArgs;
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 0c994e0..4b27a42 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1750,7 +1750,8 @@ def ReadOnlyPlacementChecks : DiagGroup<"read-only-types">;
// Warnings and fixes to support the "safe buffers" programming model.
def UnsafeBufferUsageInContainer : DiagGroup<"unsafe-buffer-usage-in-container">;
def UnsafeBufferUsageInLibcCall : DiagGroup<"unsafe-buffer-usage-in-libc-call">;
-def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer, UnsafeBufferUsageInLibcCall]>;
+def UnsafeBufferUsageInUniquePtrArrayAccess : DiagGroup<"unsafe-buffer-usage-in-unique-ptr-array-access">;
+def UnsafeBufferUsage : DiagGroup<"unsafe-buffer-usage", [UnsafeBufferUsageInContainer, UnsafeBufferUsageInLibcCall, UnsafeBufferUsageInUniquePtrArrayAccess]>;
// Warnings and notes InstallAPI verification.
def InstallAPIViolation : DiagGroup<"installapi-violation">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b157cbb..5be63c0 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13295,6 +13295,8 @@ def note_safe_buffer_usage_suggestions_disabled : Note<
def warn_unsafe_buffer_usage_in_container : Warning<
"the two-parameter std::span construction is unsafe as it can introduce mismatch between buffer size and the bound information">,
InGroup<UnsafeBufferUsageInContainer>, DefaultIgnore;
+def warn_unsafe_buffer_usage_unique_ptr_array_access : Warning<"direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking">,
+ InGroup<UnsafeBufferUsageInUniquePtrArrayAccess>, DefaultIgnore;
#ifndef NDEBUG
// Not a user-facing diagnostic. Useful for debugging false negatives in
// -fsafe-buffer-usage-suggestions (i.e. lack of -Wunsafe-buffer-usage fixits).
diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def
index 1d0e97c..da85431 100644
--- a/clang/include/clang/Basic/Sanitizers.def
+++ b/clang/include/clang/Basic/Sanitizers.def
@@ -195,6 +195,9 @@ SANITIZER_GROUP("bounds", Bounds, ArrayBounds | LocalBounds)
// Scudo hardened allocator
SANITIZER("scudo", Scudo)
+// AllocToken
+SANITIZER("alloc-token", AllocToken)
+
// Magic group, containing all sanitizers. For example, "-fno-sanitize=all"
// can be used to disable all the sanitizers.
SANITIZER_GROUP("all", All, ~SanitizerMask())
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 43832b7..bb62223 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -770,6 +770,51 @@ def CIR_VisibilityAttr : CIR_EnumAttr<CIR_VisibilityKind, "visibility"> {
}
//===----------------------------------------------------------------------===//
+// GlobalCtorAttr
+//===----------------------------------------------------------------------===//
+
+class CIR_GlobalCtorDtor<string name, string attrMnemonic>
+ : CIR_Attr<"Global" # name, "global_" # attrMnemonic> {
+ let parameters = (ins "mlir::StringAttr":$name, "int":$priority);
+
+ let skipDefaultBuilders = 1;
+ let builders = [
+ AttrBuilder<(ins
+ "llvm::StringRef":$name,
+ CArg<"int", "65535">:$priority), [{
+ return $_get($_ctxt, mlir::StringAttr::get($_ctxt, name), priority);
+ }]>,
+ AttrBuilderWithInferredContext<(ins
+ "mlir::StringAttr":$name,
+ CArg<"int", "65535">:$priority), [{
+ return $_get(name.getContext(), name, priority);
+ }]>
+ ];
+
+ let assemblyFormat = [{
+ `<` $name `,` $priority `>`
+ }];
+
+ let extraClassDeclaration = [{
+ bool isDefaultPriority() const {
+ return getPriority() == getDefaultPriority();
+ };
+
+ static int getDefaultPriority() {
+ return 65535;
+ }
+ }];
+}
+
+def CIR_GlobalCtorAttr : CIR_GlobalCtorDtor<"Ctor", "ctor"> {
+ let summary = "Marks a function as a global constructor";
+ let description = [{
+ Marks the function as a global constructor in the module's constructor list.
+ It will be executed before main() is called.
+ }];
+}
+
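+// Example of the resulting attribute syntax, derived from the assembly
+// format above (illustrative):
+//   #cir.global_ctor<"module_ctor", 65535>
+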
+//===----------------------------------------------------------------------===//
// BitfieldInfoAttr
//===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
index 417a226..5c6ce7a 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDataLayout.h
@@ -14,6 +14,7 @@
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/IR/BuiltinOps.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
namespace cir {
@@ -81,6 +82,18 @@ public:
}
llvm::TypeSize getTypeSizeInBits(mlir::Type ty) const;
+
+ llvm::TypeSize getPointerTypeSizeInBits(mlir::Type ty) const {
+ assert(mlir::isa<cir::PointerType>(ty) &&
+ "This should only be called with a pointer type");
+ return layout.getTypeSizeInBits(ty);
+ }
+
+ mlir::Type getIntPtrType(mlir::Type ty) const {
+ assert(mlir::isa<cir::PointerType>(ty) && "Expected pointer type");
+ return cir::IntType::get(ty.getContext(), getPointerTypeSizeInBits(ty),
+ false);
+ }
};
} // namespace cir
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
index 15d5fa0..feb08d60 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td
@@ -42,6 +42,7 @@ def CIR_Dialect : Dialect {
static llvm::StringRef getNoThrowAttrName() { return "nothrow"; }
static llvm::StringRef getSideEffectAttrName() { return "side_effect"; }
static llvm::StringRef getModuleLevelAsmAttrName() { return "cir.module_asm"; }
+ static llvm::StringRef getGlobalCtorsAttrName() { return "cir.global_ctors"; }
void registerAttributes();
void registerTypes();
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index f795800..3b7b130 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -39,7 +39,6 @@ struct MissingFeatures {
static bool opGlobalUsedOrCompilerUsed() { return false; }
static bool opGlobalAnnotations() { return false; }
static bool opGlobalDtorLowering() { return false; }
- static bool opGlobalCtorAttr() { return false; }
static bool opGlobalCtorPriority() { return false; }
static bool opGlobalCtorList() { return false; }
static bool setDSOLocal() { return false; }
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 60c4ad4..ec38231 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2731,8 +2731,25 @@ def fsanitize_skip_hot_cutoff_EQ
"(0.0 [default] = skip none; 1.0 = skip all). "
"Argument format: <sanitizer1>=<value1>,<sanitizer2>=<value2>,...">;
+defm sanitize_alloc_token_fast_abi : BoolOption<"f", "sanitize-alloc-token-fast-abi",
+ CodeGenOpts<"SanitizeAllocTokenFastABI">, DefaultFalse,
+ PosFlag<SetTrue, [], [ClangOption], "Use the AllocToken fast ABI">,
+ NegFlag<SetFalse, [], [ClangOption], "Use the default AllocToken ABI">>,
+ Group<f_clang_Group>;
+defm sanitize_alloc_token_extended : BoolOption<"f", "sanitize-alloc-token-extended",
+ CodeGenOpts<"SanitizeAllocTokenExtended">, DefaultFalse,
+ PosFlag<SetTrue, [], [ClangOption], "Enable">,
+ NegFlag<SetFalse, [], [ClangOption], "Disable">,
+ BothFlags<[], [ClangOption], " extended coverage to custom allocation functions">>,
+ Group<f_clang_Group>;
+
} // end -f[no-]sanitize* flags
+def falloc_token_max_EQ : Joined<["-"], "falloc-token-max=">,
+ Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
+ MetaVarName<"<N>">,
+ HelpText<"Limit to maximum N allocation tokens (0 = no max)">;
+
def fallow_runtime_check_skip_hot_cutoff_EQ
: Joined<["-"], "fallow-runtime-check-skip-hot-cutoff=">,
Group<f_clang_Group>,
@@ -4715,6 +4732,10 @@ def gdwarf_4 : Flag<["-"], "gdwarf-4">, Group<g_Group>,
HelpText<"Generate source-level debug information with dwarf version 4">;
def gdwarf_5 : Flag<["-"], "gdwarf-5">, Group<g_Group>,
HelpText<"Generate source-level debug information with dwarf version 5">;
+def gdwarf_6
+ : Flag<["-"], "gdwarf-6">,
+ Group<g_Group>,
+ HelpText<"Generate source-level debug information with dwarf version 6">;
}
def gdwarf64 : Flag<["-"], "gdwarf64">, Group<g_Group>,
Visibility<[ClangOption, CC1Option, CC1AsOption]>,
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 2b72268..eea7897 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -75,6 +75,8 @@ class SanitizerArgs {
llvm::AsanDetectStackUseAfterReturnMode::Invalid;
std::string MemtagMode;
+ bool AllocTokenFastABI = false;
+ bool AllocTokenExtended = false;
public:
/// Parses the sanitizer arguments from an argument list.
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 1eea813..9125250 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -678,30 +678,6 @@ static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC,
return true;
}
-static bool interp__builtin_parity(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call) {
- APSInt Val = popToAPSInt(S, Call->getArg(0));
- pushInteger(S, Val.popcount() % 2, Call->getType());
- return true;
-}
-
-static bool interp__builtin_clrsb(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call) {
- APSInt Val = popToAPSInt(S, Call->getArg(0));
- pushInteger(S, Val.getBitWidth() - Val.getSignificantBits(), Call->getType());
- return true;
-}
-
-static bool interp__builtin_bitreverse(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call) {
- APSInt Val = popToAPSInt(S, Call->getArg(0));
- pushInteger(S, Val.reverseBits(), Call->getType());
- return true;
-}
-
static bool interp__builtin_classify_type(InterpState &S, CodePtr OpPC,
const InterpFrame *Frame,
const CallExpr *Call) {
@@ -736,16 +712,6 @@ static bool interp__builtin_expect(InterpState &S, CodePtr OpPC,
return true;
}
-static bool interp__builtin_ffs(InterpState &S, CodePtr OpPC,
- const InterpFrame *Frame,
- const CallExpr *Call) {
- APSInt Value = popToAPSInt(S, Call->getArg(0));
-
- uint64_t N = Value.countr_zero();
- pushInteger(S, N == Value.getBitWidth() ? 0 : N + 1, Call->getType());
- return true;
-}
-
static bool interp__builtin_addressof(InterpState &S, CodePtr OpPC,
const InterpFrame *Frame,
const CallExpr *Call) {
@@ -2583,6 +2549,44 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC,
return true;
}
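+
+// Evaluates an x86 multiply-add of adjacent element pairs (pmaddwd /
+// pmaddubsw): Fn combines each pair of neighboring elements from the two
+// source vectors into a single widened destination element.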
+static bool interp__builtin_ia32_pmadd(
+ InterpState &S, CodePtr OpPC, const CallExpr *Call,
+ llvm::function_ref<APInt(const APSInt &, const APSInt &, const APSInt &,
+ const APSInt &)>
+ Fn) {
+ assert(Call->getArg(0)->getType()->isVectorType() &&
+ Call->getArg(1)->getType()->isVectorType());
+ const Pointer &RHS = S.Stk.pop<Pointer>();
+ const Pointer &LHS = S.Stk.pop<Pointer>();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
+ PrimType ElemT = *S.getContext().classify(VT->getElementType());
+ unsigned NumElems = VT->getNumElements();
+ const auto *DestVT = Call->getType()->castAs<VectorType>();
+ PrimType DestElemT = *S.getContext().classify(DestVT->getElementType());
+ bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
+
+ unsigned DstElem = 0;
+ for (unsigned I = 0; I != NumElems; I += 2) {
+ APSInt Result;
+ INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+ APSInt LoLHS = LHS.elem<T>(I).toAPSInt();
+ APSInt HiLHS = LHS.elem<T>(I + 1).toAPSInt();
+ APSInt LoRHS = RHS.elem<T>(I).toAPSInt();
+ APSInt HiRHS = RHS.elem<T>(I + 1).toAPSInt();
+ Result = APSInt(Fn(LoLHS, HiLHS, LoRHS, HiRHS), DestUnsigned);
+ });
+
+ INT_TYPE_SWITCH_NO_BOOL(DestElemT,
+ { Dst.elem<T>(DstElem) = static_cast<T>(Result); });
+ ++DstElem;
+ }
+
+ Dst.initializeAllElements();
+ return true;
+}
+
static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
const CallExpr *Call,
unsigned BuiltinID) {
@@ -3158,18 +3162,25 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case Builtin::BI__builtin_parity:
case Builtin::BI__builtin_parityl:
case Builtin::BI__builtin_parityll:
- return interp__builtin_parity(S, OpPC, Frame, Call);
-
+ return interp__builtin_elementwise_int_unaryop(
+ S, OpPC, Call, [](const APSInt &Val) -> APInt {
+ return APInt(Val.getBitWidth(), Val.popcount() % 2);
+ });
case Builtin::BI__builtin_clrsb:
case Builtin::BI__builtin_clrsbl:
case Builtin::BI__builtin_clrsbll:
- return interp__builtin_clrsb(S, OpPC, Frame, Call);
-
+ return interp__builtin_elementwise_int_unaryop(
+ S, OpPC, Call, [](const APSInt &Val) -> APInt {
+ return APInt(Val.getBitWidth(),
+ Val.getBitWidth() - Val.getSignificantBits());
+ });
case Builtin::BI__builtin_bitreverse8:
case Builtin::BI__builtin_bitreverse16:
case Builtin::BI__builtin_bitreverse32:
case Builtin::BI__builtin_bitreverse64:
- return interp__builtin_bitreverse(S, OpPC, Frame, Call);
+ return interp__builtin_elementwise_int_unaryop(
+ S, OpPC, Call,
+ [](const APSInt &Val) -> APInt { return Val.reverseBits(); });
case Builtin::BI__builtin_classify_type:
return interp__builtin_classify_type(S, OpPC, Frame, Call);
@@ -3209,7 +3220,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case Builtin::BI__builtin_ffs:
case Builtin::BI__builtin_ffsl:
case Builtin::BI__builtin_ffsll:
- return interp__builtin_ffs(S, OpPC, Frame, Call);
+ return interp__builtin_elementwise_int_unaryop(
+ S, OpPC, Call, [](const APSInt &Val) {
+ return APInt(Val.getBitWidth(),
+ Val.isZero() ? 0u : Val.countTrailingZeros() + 1u);
+ });
case Builtin::BIaddressof:
case Builtin::BI__addressof:
@@ -3494,6 +3509,30 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return interp__builtin_elementwise_int_binop(S, OpPC, Call,
llvm::APIntOps::avgCeilU);
+ case clang::X86::BI__builtin_ia32_pmaddubsw128:
+ case clang::X86::BI__builtin_ia32_pmaddubsw256:
+ case clang::X86::BI__builtin_ia32_pmaddubsw512:
+ return interp__builtin_ia32_pmadd(
+ S, OpPC, Call,
+ [](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
+ const APSInt &HiRHS) {
+ unsigned BitWidth = 2 * LoLHS.getBitWidth();
+ return (LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth))
+ .sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth)));
+ });
+
+ case clang::X86::BI__builtin_ia32_pmaddwd128:
+ case clang::X86::BI__builtin_ia32_pmaddwd256:
+ case clang::X86::BI__builtin_ia32_pmaddwd512:
+ return interp__builtin_ia32_pmadd(
+ S, OpPC, Call,
+ [](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS,
+ const APSInt &HiRHS) {
+ unsigned BitWidth = 2 * LoLHS.getBitWidth();
+ return (LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) +
+ (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth));
+ });
+
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 618e163..35a866e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11778,6 +11778,54 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pavgw512:
return EvaluateBinOpExpr(llvm::APIntOps::avgCeilU);
+ case clang::X86::BI__builtin_ia32_pmaddubsw128:
+ case clang::X86::BI__builtin_ia32_pmaddubsw256:
+ case clang::X86::BI__builtin_ia32_pmaddubsw512:
+ case clang::X86::BI__builtin_ia32_pmaddwd128:
+ case clang::X86::BI__builtin_ia32_pmaddwd256:
+ case clang::X86::BI__builtin_ia32_pmaddwd512: {
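+    // Multiply-add of adjacent element pairs: each result element combines
+    // two neighboring elements from each source vector; pmaddubsw saturates,
+    // pmaddwd does not.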
+ APValue SourceLHS, SourceRHS;
+ if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
+ !EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
+ return false;
+
+ auto *DestTy = E->getType()->castAs<VectorType>();
+ QualType DestEltTy = DestTy->getElementType();
+ unsigned SourceLen = SourceLHS.getVectorLength();
+ bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
+ SmallVector<APValue, 4> ResultElements;
+ ResultElements.reserve(SourceLen / 2);
+
+ for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
+ const APSInt &LoLHS = SourceLHS.getVectorElt(EltNum).getInt();
+ const APSInt &HiLHS = SourceLHS.getVectorElt(EltNum + 1).getInt();
+ const APSInt &LoRHS = SourceRHS.getVectorElt(EltNum).getInt();
+ const APSInt &HiRHS = SourceRHS.getVectorElt(EltNum + 1).getInt();
+ unsigned BitWidth = 2 * LoLHS.getBitWidth();
+
+ switch (E->getBuiltinCallee()) {
+ case clang::X86::BI__builtin_ia32_pmaddubsw128:
+ case clang::X86::BI__builtin_ia32_pmaddubsw256:
+ case clang::X86::BI__builtin_ia32_pmaddubsw512:
+ ResultElements.push_back(APValue(
+ APSInt((LoLHS.zext(BitWidth) * LoRHS.sext(BitWidth))
+ .sadd_sat((HiLHS.zext(BitWidth) * HiRHS.sext(BitWidth))),
+ DestUnsigned)));
+ break;
+ case clang::X86::BI__builtin_ia32_pmaddwd128:
+ case clang::X86::BI__builtin_ia32_pmaddwd256:
+ case clang::X86::BI__builtin_ia32_pmaddwd512:
+ ResultElements.push_back(
+ APValue(APSInt((LoLHS.sext(BitWidth) * LoRHS.sext(BitWidth)) +
+ (HiLHS.sext(BitWidth) * HiRHS.sext(BitWidth)),
+ DestUnsigned)));
+ break;
+ }
+ }
+
+ return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+ }
+
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
case clang::X86::BI__builtin_ia32_pmulhuw512:
diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index ad3d234..f5a3686 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -13,6 +13,7 @@
#include "clang/AST/Attr.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DeclCXX.h"
+#include "clang/AST/DeclTemplate.h"
#include "clang/AST/DynamicRecursiveASTVisitor.h"
#include "clang/AST/Expr.h"
#include "clang/AST/FormatString.h"
@@ -1318,6 +1319,97 @@ static bool isSupportedVariable(const DeclRefExpr &Node) {
return D != nullptr && isa<VarDecl>(D);
}
+// Returns true for RecordDecl of type std::unique_ptr<T[]>
+static bool isUniquePtrArray(const CXXRecordDecl *RecordDecl) {
+ if (!RecordDecl || !RecordDecl->isInStdNamespace() ||
+ RecordDecl->getNameAsString() != "unique_ptr")
+ return false;
+
+  const auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(RecordDecl);
+  if (!CTSD)
+    return false;
+
+  const TemplateArgumentList &TemplateArgs = CTSD->getTemplateArgs();
+  if (TemplateArgs.size() == 0)
+    return false;
+
+  const TemplateArgument &FirstArg = TemplateArgs[0];
+  if (FirstArg.getKind() != TemplateArgument::Type)
+    return false;
+
+  QualType ReferredType = FirstArg.getAsType();
+  return ReferredType->isArrayType();
+}
+
+class UniquePtrArrayAccessGadget : public WarningGadget {
+private:
+ static constexpr const char *const AccessorTag = "unique_ptr_array_access";
+ const CXXOperatorCallExpr *AccessorExpr;
+
+public:
+ UniquePtrArrayAccessGadget(const MatchResult &Result)
+ : WarningGadget(Kind::UniquePtrArrayAccess),
+ AccessorExpr(Result.getNodeAs<CXXOperatorCallExpr>(AccessorTag)) {
+ assert(AccessorExpr &&
+ "UniquePtrArrayAccessGadget requires a matched CXXOperatorCallExpr");
+ }
+
+ static bool classof(const Gadget *G) {
+ return G->getKind() == Kind::UniquePtrArrayAccess;
+ }
+
+ static bool matches(const Stmt *S, const ASTContext &Ctx,
+ MatchResult &Result) {
+
+ const CXXOperatorCallExpr *OpCall = dyn_cast<CXXOperatorCallExpr>(S);
+ if (!OpCall || OpCall->getOperator() != OO_Subscript)
+ return false;
+
+ const Expr *Callee = OpCall->getCallee()->IgnoreParenImpCasts();
+ if (!Callee)
+ return false;
+
+ const CXXMethodDecl *Method =
+ dyn_cast_or_null<CXXMethodDecl>(OpCall->getDirectCallee());
+ if (!Method)
+ return false;
+
+ if (Method->getOverloadedOperator() != OO_Subscript)
+ return false;
+
+ const CXXRecordDecl *RecordDecl = Method->getParent();
+ if (!isUniquePtrArray(RecordDecl))
+ return false;
+
+ const Expr *IndexExpr = OpCall->getArg(1);
+ clang::Expr::EvalResult Eval;
+
+    // Allow [0]: a constant index of zero is treated as safe.
+ if (IndexExpr->EvaluateAsInt(Eval, Ctx) && Eval.Val.getInt().isZero())
+ return false;
+
+ Result.addNode(AccessorTag, DynTypedNode::create(*OpCall));
+ return true;
+ }
+ void handleUnsafeOperation(UnsafeBufferUsageHandler &Handler,
+ bool IsRelatedToDecl,
+ ASTContext &Ctx) const override {
+ Handler.handleUnsafeUniquePtrArrayAccess(
+ DynTypedNode::create(*AccessorExpr), IsRelatedToDecl, Ctx);
+ }
+
+ SourceLocation getSourceLoc() const override {
+ if (AccessorExpr)
+ return AccessorExpr->getOperatorLoc();
+ return SourceLocation();
+ }
+
+ DeclUseList getClaimedVarUseSites() const override { return {}; }
+ SmallVector<const Expr *, 1> getUnsafePtrs() const override { return {}; }
+};
+
using FixableGadgetList = std::vector<std::unique_ptr<FixableGadget>>;
using WarningGadgetList = std::vector<std::unique_ptr<WarningGadget>>;
@@ -2632,10 +2724,13 @@ std::set<const Expr *> clang::findUnsafePointers(const FunctionDecl *FD) {
const VariableGroupsManager &, FixItList &&,
const Decl *,
const FixitStrategy &) override {}
- bool isSafeBufferOptOut(const SourceLocation &) const override {
+ void handleUnsafeUniquePtrArrayAccess(const DynTypedNode &Node,
+ bool IsRelatedToDecl,
+ ASTContext &Ctx) override {}
+ bool ignoreUnsafeBufferInContainer(const SourceLocation &) const override {
return false;
}
- bool ignoreUnsafeBufferInContainer(const SourceLocation &) const override {
+ bool isSafeBufferOptOut(const SourceLocation &) const override {
return false;
}
bool ignoreUnsafeBufferInLibcCall(const SourceLocation &) const override {
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 2b89370..a955c3b 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -517,12 +517,6 @@ public:
const SourceManager &SM) const;
private:
- // Find the longest glob pattern that matches FilePath amongst
- // CategoriesToMatchers, return true iff the match exists and belongs to a
- // positive category.
- bool globsMatches(const llvm::StringMap<Matcher> &CategoriesToMatchers,
- StringRef FilePath) const;
-
llvm::DenseMap<diag::kind, const Section *> DiagToSection;
};
} // namespace
@@ -584,43 +578,24 @@ void DiagnosticsEngine::setDiagSuppressionMapping(llvm::MemoryBuffer &Input) {
bool WarningsSpecialCaseList::isDiagSuppressed(diag::kind DiagId,
SourceLocation DiagLoc,
const SourceManager &SM) const {
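+  // Use the presumed location to improve reproducibility for preprocessed
+  // inputs.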
+ PresumedLoc PLoc = SM.getPresumedLoc(DiagLoc);
+ if (!PLoc.isValid())
+ return false;
const Section *DiagSection = DiagToSection.lookup(DiagId);
if (!DiagSection)
return false;
- const SectionEntries &EntityTypeToCategories = DiagSection->Entries;
- auto SrcEntriesIt = EntityTypeToCategories.find("src");
- if (SrcEntriesIt == EntityTypeToCategories.end())
+
+ StringRef F = llvm::sys::path::remove_leading_dotslash(PLoc.getFilename());
+
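+  // A file is suppressed if its longest matching "src" glob is not overridden
+  // by an "emit" (un-suppress) glob of equal or greater length.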
+ StringRef LongestSup = DiagSection->getLongestMatch("src", F, "");
+ if (LongestSup.empty())
return false;
- const llvm::StringMap<llvm::SpecialCaseList::Matcher> &CategoriesToMatchers =
- SrcEntriesIt->getValue();
- // We also use presumed locations here to improve reproducibility for
- // preprocessed inputs.
- if (PresumedLoc PLoc = SM.getPresumedLoc(DiagLoc); PLoc.isValid())
- return globsMatches(
- CategoriesToMatchers,
- llvm::sys::path::remove_leading_dotslash(PLoc.getFilename()));
- return false;
-}
-bool WarningsSpecialCaseList::globsMatches(
- const llvm::StringMap<Matcher> &CategoriesToMatchers,
- StringRef FilePath) const {
- StringRef LongestMatch;
- bool LongestIsPositive = false;
- for (const auto &Entry : CategoriesToMatchers) {
- StringRef Category = Entry.getKey();
- const llvm::SpecialCaseList::Matcher &Matcher = Entry.getValue();
- bool IsPositive = Category != "emit";
- for (const auto &Glob : Matcher.Globs) {
- if (Glob->Name.size() < LongestMatch.size())
- continue;
- if (!Glob->Pattern.match(FilePath))
- continue;
- LongestMatch = Glob->Name;
- LongestIsPositive = IsPositive;
- }
- }
- return LongestIsPositive;
+ StringRef LongestEmit = DiagSection->getLongestMatch("src", F, "emit");
+ if (LongestEmit.empty())
+ return true;
+
+ return LongestSup.size() > LongestEmit.size();
}
bool DiagnosticsEngine::isSuppressedViaMapping(diag::kind DiagId,
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index 8f4377b..d9ebf19 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -870,6 +870,109 @@ void CIRGenFunction::destroyCXXObject(CIRGenFunction &cgf, Address addr,
/*delegating=*/false, addr, type);
}
+namespace {
+class DestroyField final : public EHScopeStack::Cleanup {
+ const FieldDecl *field;
+ CIRGenFunction::Destroyer *destroyer;
+
+public:
+ DestroyField(const FieldDecl *field, CIRGenFunction::Destroyer *destroyer)
+ : field(field), destroyer(destroyer) {}
+
+ void emit(CIRGenFunction &cgf) override {
+ // Find the address of the field.
+ Address thisValue = cgf.loadCXXThisAddress();
+ CanQualType recordTy =
+ cgf.getContext().getCanonicalTagType(field->getParent());
+ LValue thisLV = cgf.makeAddrLValue(thisValue, recordTy);
+ LValue lv = cgf.emitLValueForField(thisLV, field);
+ assert(lv.isSimple());
+
+ assert(!cir::MissingFeatures::ehCleanupFlags());
+ cgf.emitDestroy(lv.getAddress(), field->getType(), destroyer);
+ }
+
+ // This is a placeholder until EHCleanupScope is implemented.
+ size_t getSize() const override {
+ assert(!cir::MissingFeatures::ehCleanupScope());
+ return sizeof(DestroyField);
+ }
+};
+} // namespace
+
+/// Emit all code that comes at the end of class's destructor. This is to call
+/// destructors on members and base classes in reverse order of their
+/// construction.
+///
+/// For a deleting destructor, this also handles the case where a destroying
+/// operator delete completely overrides the definition.
+void CIRGenFunction::enterDtorCleanups(const CXXDestructorDecl *dd,
+ CXXDtorType dtorType) {
+ assert((!dd->isTrivial() || dd->hasAttr<DLLExportAttr>()) &&
+ "Should not emit dtor epilogue for non-exported trivial dtor!");
+
+ // The deleting-destructor phase just needs to call the appropriate
+ // operator delete that Sema picked up.
+ if (dtorType == Dtor_Deleting) {
+ cgm.errorNYI(dd->getSourceRange(), "deleting destructor cleanups");
+ return;
+ }
+
+ const CXXRecordDecl *classDecl = dd->getParent();
+
+ // Unions have no bases and do not call field destructors.
+ if (classDecl->isUnion())
+ return;
+
+ // The complete-destructor phase just destructs all the virtual bases.
+ if (dtorType == Dtor_Complete) {
+ assert(!cir::MissingFeatures::sanitizers());
+
+ if (classDecl->getNumVBases())
+ cgm.errorNYI(dd->getSourceRange(), "virtual base destructor cleanups");
+
+ return;
+ }
+
+ assert(dtorType == Dtor_Base);
+ assert(!cir::MissingFeatures::sanitizers());
+
+ // Destroy non-virtual bases.
+ for (const CXXBaseSpecifier &base : classDecl->bases()) {
+ // Ignore virtual bases.
+ if (base.isVirtual())
+ continue;
+
+ CXXRecordDecl *baseClassDecl = base.getType()->getAsCXXRecordDecl();
+
+ if (baseClassDecl->hasTrivialDestructor())
+ assert(!cir::MissingFeatures::sanitizers());
+ else
+ cgm.errorNYI(dd->getSourceRange(),
+ "non-trivial base destructor cleanups");
+ }
+
+ assert(!cir::MissingFeatures::sanitizers());
+
+ // Destroy direct fields.
+ for (const FieldDecl *field : classDecl->fields()) {
+ QualType type = field->getType();
+ QualType::DestructionKind dtorKind = type.isDestructedType();
+ if (!dtorKind)
+ continue;
+
+ // Anonymous union members do not have their destructors called.
+ const RecordType *rt = type->getAsUnionType();
+ if (rt && rt->getOriginalDecl()->isAnonymousStructOrUnion())
+ continue;
+
+ CleanupKind cleanupKind = getCleanupKind(dtorKind);
+ assert(!cir::MissingFeatures::ehCleanupFlags());
+ ehStack.pushCleanup<DestroyField>(cleanupKind, field,
+ getDestroyer(dtorKind));
+ }
+}
+
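As a sketch of the semantics these cleanups implement (hypothetical types for illustration), members are destroyed in reverse declaration order after the destructor body runs:

    struct Member { ~Member(); };
    struct Widget {
      Member a;    // constructed first, destroyed last
      Member b;    // constructed second, destroyed first
      ~Widget() {} // body runs, then the epilogue cleanups destroy b, then a
    };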
void CIRGenFunction::emitDelegatingCXXConstructorCall(
const CXXConstructorDecl *ctor, const FunctionArgList &args) {
assert(ctor->isDelegatingConstructor());
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 5d3496a..7edd83e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -1893,6 +1893,28 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
}
return v;
}
+ case CK_IntegralToPointer: {
+ mlir::Type destCIRTy = cgf.convertType(destTy);
+ mlir::Value src = Visit(const_cast<Expr *>(subExpr));
+
+ // Properly resize by casting to an int of the same size as the pointer.
+ // Clang's IntegralToPointer includes 'bool' as the source, but in CIR
+ // 'bool' is not an integral type. So check the source type to get the
+ // correct CIR conversion.
+ mlir::Type middleTy = cgf.cgm.getDataLayout().getIntPtrType(destCIRTy);
+ mlir::Value middleVal = builder.createCast(
+ subExpr->getType()->isBooleanType() ? cir::CastKind::bool_to_int
+ : cir::CastKind::integral,
+ src, middleTy);
+
+ if (cgf.cgm.getCodeGenOpts().StrictVTablePointers) {
+ cgf.cgm.errorNYI(subExpr->getSourceRange(),
+ "IntegralToPointer: strict vtable pointers");
+ return {};
+ }
+
+ return builder.createIntToPtr(middleVal, destCIRTy);
+ }
case CK_ArrayToPointerDecay:
return cgf.emitArrayToPointerDecay(subExpr).getPointer();
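A minimal sketch of the two source shapes this handles (compare the new cast.cpp tests below): a plain integer only needs an integral resize to the pointer-width type, while a bool source takes the bool_to_int path first, since CIR's bool is not an integral type:

    void *from_long(long v) { return (void *)v; } // integral cast, then int_to_ptr
    void *from_bool(bool b) { return (void *)b; } // bool_to_int, then int_to_ptr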
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 52fb0d7..7a774e0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -689,7 +689,9 @@ void CIRGenFunction::emitDestructorBody(FunctionArgList &args) {
cgm.errorNYI(dtor->getSourceRange(), "function-try-block destructor");
assert(!cir::MissingFeatures::sanitizers());
- assert(!cir::MissingFeatures::dtorCleanups());
+
+ // Enter the epilogue cleanups.
+ RunCleanupsScope dtorEpilogue(*this);
// If this is the complete variant, just invoke the base variant;
// the epilogue will destruct the virtual bases. But we can't do
@@ -708,7 +710,8 @@ void CIRGenFunction::emitDestructorBody(FunctionArgList &args) {
assert((body || getTarget().getCXXABI().isMicrosoft()) &&
"can't emit a dtor without a body for non-Microsoft ABIs");
- assert(!cir::MissingFeatures::dtorCleanups());
+ // Enter the cleanup scopes for virtual bases.
+ enterDtorCleanups(dtor, Dtor_Complete);
if (!isTryBody) {
QualType thisTy = dtor->getFunctionObjectParameterType();
@@ -723,7 +726,9 @@ void CIRGenFunction::emitDestructorBody(FunctionArgList &args) {
case Dtor_Base:
assert(body);
- assert(!cir::MissingFeatures::dtorCleanups());
+ // Enter the cleanup scopes for fields and non-virtual bases.
+ enterDtorCleanups(dtor, Dtor_Base);
+
assert(!cir::MissingFeatures::vtableInitialization());
if (isTryBody) {
@@ -741,7 +746,8 @@ void CIRGenFunction::emitDestructorBody(FunctionArgList &args) {
break;
}
- assert(!cir::MissingFeatures::dtorCleanups());
+ // Jump out through the epilogue cleanups.
+ dtorEpilogue.forceCleanup();
// Exit the try if applicable.
if (isTryBody)
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index a60efe1..db2adc2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -556,6 +556,33 @@ public:
cir::GlobalOp gv,
cir::GetGlobalOp gvAddr);
+ /// Enter the cleanups necessary to complete the given phase of destruction
+ /// for a destructor. The end result should call destructors on members and
+ /// base classes in reverse order of their construction.
+ void enterDtorCleanups(const CXXDestructorDecl *dtor, CXXDtorType type);
+
+ /// Determines whether an EH cleanup is required to destroy a type
+ /// with the given destruction kind.
+ /// TODO(cir): could be shared with Clang LLVM codegen
+ bool needsEHCleanup(QualType::DestructionKind kind) {
+ switch (kind) {
+ case QualType::DK_none:
+ return false;
+ case QualType::DK_cxx_destructor:
+ case QualType::DK_objc_weak_lifetime:
+ case QualType::DK_nontrivial_c_struct:
+ return getLangOpts().Exceptions;
+ case QualType::DK_objc_strong_lifetime:
+ return getLangOpts().Exceptions &&
+ cgm.getCodeGenOpts().ObjCAutoRefCountExceptions;
+ }
+ llvm_unreachable("bad destruction kind");
+ }
+
+ CleanupKind getCleanupKind(QualType::DestructionKind kind) {
+ return needsEHCleanup(kind) ? NormalAndEHCleanup : NormalCleanup;
+ }
+
/// Set the address of a local variable.
void setAddrOfLocalVar(const clang::VarDecl *vd, Address addr) {
assert(!localDeclMap.count(vd) && "Decl already exists in LocalDeclMap!");
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp
index a9af753..4cf2237 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp
@@ -87,7 +87,10 @@ CIRGenFunction::getOpenACCDataOperandInfo(const Expr *e) {
if (const auto *section = dyn_cast<ArraySectionExpr>(curVarExpr)) {
QualType baseTy = ArraySectionExpr::getBaseOriginalType(
section->getBase()->IgnoreParenImpCasts());
- boundTypes.push_back(QualType(baseTy->getPointeeOrArrayElementType(), 0));
+ if (auto *at = getContext().getAsArrayType(baseTy))
+ boundTypes.push_back(at->getElementType());
+ else
+ boundTypes.push_back(baseTy->getPointeeType());
} else {
boundTypes.push_back(curVarExpr->getType());
}
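A sketch of the two base shapes distinguished here (hypothetical declarations): for an array base the bound type is the array's element type, while for a pointer base it is the pointee type:

    void uses(int *p, int arr[5][5], unsigned a, unsigned b) {
    #pragma acc parallel private(arr[a:b][a:b]) // array base: element types int[5], then int
      ;
    #pragma acc parallel private(p[a:b])        // pointer base: pointee type int
      ;
    }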
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 2eeef81..bc917d0 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -61,6 +61,9 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
/// Build a module init function that calls all the dynamic initializers.
void buildCXXGlobalInitFunc();
+ /// Materialize the global ctor/dtor list.
+ void buildGlobalCtorDtorList();
+
cir::FuncOp buildRuntimeFunction(
mlir::OpBuilder &builder, llvm::StringRef name, mlir::Location loc,
cir::FuncType type,
@@ -79,6 +82,9 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
llvm::StringMap<uint32_t> dynamicInitializerNames;
llvm::SmallVector<cir::FuncOp> dynamicInitializers;
+ /// List of ctors and their priorities to be called before main()
+ llvm::SmallVector<std::pair<std::string, uint32_t>, 4> globalCtorList;
+
void setASTContext(clang::ASTContext *c) { astCtx = c; }
};
@@ -689,11 +695,36 @@ void LoweringPreparePass::lowerGlobalOp(GlobalOp op) {
assert(!cir::MissingFeatures::opGlobalAnnotations());
}
+template <typename AttributeTy>
+static llvm::SmallVector<mlir::Attribute>
+prepareCtorDtorAttrList(mlir::MLIRContext *context,
+ llvm::ArrayRef<std::pair<std::string, uint32_t>> list) {
+ llvm::SmallVector<mlir::Attribute> attrs;
+ for (const auto &[name, priority] : list)
+ attrs.push_back(AttributeTy::get(context, name, priority));
+ return attrs;
+}
+
+void LoweringPreparePass::buildGlobalCtorDtorList() {
+ if (!globalCtorList.empty()) {
+ llvm::SmallVector<mlir::Attribute> globalCtors =
+ prepareCtorDtorAttrList<cir::GlobalCtorAttr>(&getContext(),
+ globalCtorList);
+
+ mlirModule->setAttr(cir::CIRDialect::getGlobalCtorsAttrName(),
+ mlir::ArrayAttr::get(&getContext(), globalCtors));
+ }
+
+ assert(!cir::MissingFeatures::opGlobalDtorLowering());
+}
+
void LoweringPreparePass::buildCXXGlobalInitFunc() {
if (dynamicInitializers.empty())
return;
- assert(!cir::MissingFeatures::opGlobalCtorList());
+ // TODO: handle globals with a user-specified initialization priority.
+ // TODO: handle default priority more nicely.
+ assert(!cir::MissingFeatures::opGlobalCtorPriority());
SmallString<256> fnName;
// Include the filename in the symbol name. Including "sub_" matches gcc
@@ -722,6 +753,10 @@ void LoweringPreparePass::buildCXXGlobalInitFunc() {
builder.setInsertionPointToStart(f.addEntryBlock());
for (cir::FuncOp &f : dynamicInitializers)
builder.createCallOp(f.getLoc(), f, {});
+ // Add the global init function (not the individual ctor functions) to the
+ // global ctor list.
+ globalCtorList.emplace_back(fnName,
+ cir::GlobalCtorAttr::getDefaultPriority());
cir::ReturnOp::create(builder, f.getLoc());
}
@@ -852,6 +887,7 @@ void LoweringPreparePass::runOnOperation() {
runOnOp(o);
buildCXXGlobalInitFunc();
+ buildGlobalCtorDtorList();
}
std::unique_ptr<Pass> mlir::createLoweringPreparePass() {
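After this pass runs, the module carries the ctor list as a module-level attribute; the updated global-init.cpp test below checks the shape (filename elided):

    module @... attributes {cir.global_ctors = [#cir.global_ctor<"_GLOBAL__sub_I_...", 65535>]}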
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index e9649af..a80a295 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -2413,6 +2413,73 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter,
});
}
+static void buildCtorDtorList(
+ mlir::ModuleOp module, StringRef globalXtorName, StringRef llvmXtorName,
+ llvm::function_ref<std::pair<StringRef, int>(mlir::Attribute)> createXtor) {
+ llvm::SmallVector<std::pair<StringRef, int>> globalXtors;
+ for (const mlir::NamedAttribute namedAttr : module->getAttrs()) {
+ if (namedAttr.getName() == globalXtorName) {
+ for (auto attr : mlir::cast<mlir::ArrayAttr>(namedAttr.getValue()))
+ globalXtors.emplace_back(createXtor(attr));
+ break;
+ }
+ }
+
+ if (globalXtors.empty())
+ return;
+
+ mlir::OpBuilder builder(module.getContext());
+ builder.setInsertionPointToEnd(&module.getBodyRegion().back());
+
+ // Create a global array llvm.global_ctors with element type of
+ // struct { i32, ptr, ptr }
+ auto ctorPFTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
+ llvm::SmallVector<mlir::Type> ctorStructFields;
+ ctorStructFields.push_back(builder.getI32Type());
+ ctorStructFields.push_back(ctorPFTy);
+ ctorStructFields.push_back(ctorPFTy);
+
+ auto ctorStructTy = mlir::LLVM::LLVMStructType::getLiteral(
+ builder.getContext(), ctorStructFields);
+ auto ctorStructArrayTy =
+ mlir::LLVM::LLVMArrayType::get(ctorStructTy, globalXtors.size());
+
+ mlir::Location loc = module.getLoc();
+ auto newGlobalOp = mlir::LLVM::GlobalOp::create(
+ builder, loc, ctorStructArrayTy, /*constant=*/false,
+ mlir::LLVM::Linkage::Appending, llvmXtorName, mlir::Attribute());
+
+ builder.createBlock(&newGlobalOp.getRegion());
+ builder.setInsertionPointToEnd(newGlobalOp.getInitializerBlock());
+
+ mlir::Value result =
+ mlir::LLVM::UndefOp::create(builder, loc, ctorStructArrayTy);
+
+ for (auto [index, fn] : llvm::enumerate(globalXtors)) {
+ mlir::Value structInit =
+ mlir::LLVM::UndefOp::create(builder, loc, ctorStructTy);
+ mlir::Value initPriority = mlir::LLVM::ConstantOp::create(
+ builder, loc, ctorStructFields[0], fn.second);
+ mlir::Value initFuncAddr = mlir::LLVM::AddressOfOp::create(
+ builder, loc, ctorStructFields[1], fn.first);
+ mlir::Value initAssociate =
+ mlir::LLVM::ZeroOp::create(builder, loc, ctorStructFields[2]);
+ // A literal 0 would make this InsertValueOp::create call ambiguous.
+ llvm::SmallVector<int64_t> zero{0};
+ structInit = mlir::LLVM::InsertValueOp::create(builder, loc, structInit,
+ initPriority, zero);
+ structInit = mlir::LLVM::InsertValueOp::create(builder, loc, structInit,
+ initFuncAddr, 1);
+ // TODO: handle associated data for initializers.
+ structInit = mlir::LLVM::InsertValueOp::create(builder, loc, structInit,
+ initAssociate, 2);
+ result = mlir::LLVM::InsertValueOp::create(builder, loc, result, structInit,
+ index);
+ }
+
+ builder.create<mlir::LLVM::ReturnOp>(loc, result);
+}
+
// The applyPartialConversion function traverses blocks in the dominance order,
// so it does not lower any operations that are not reachable from the
// operations passed in as arguments. Since we do need to lower such code in
@@ -2519,6 +2586,14 @@ void ConvertCIRToLLVMPass::runOnOperation() {
if (failed(applyPartialConversion(ops, target, std::move(patterns))))
signalPassFailure();
+
+ // Emit the llvm.global_ctors array.
+ buildCtorDtorList(module, cir::CIRDialect::getGlobalCtorsAttrName(),
+ "llvm.global_ctors", [](mlir::Attribute attr) {
+ auto ctorAttr = mlir::cast<cir::GlobalCtorAttr>(attr);
+ return std::make_pair(ctorAttr.getName(),
+ ctorAttr.getPriority());
+ });
}
mlir::LogicalResult CIRToLLVMBrOpLowering::matchAndRewrite(
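For a single default-priority constructor, the lowering above produces the usual appending global, as the updated global-init.cpp test checks (function name elided):

    @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }]
        [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_..., ptr null }]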
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 64f1917..2d95982 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -60,11 +60,13 @@
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
#include "llvm/Transforms/IPO/LowerTypeTests.h"
#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
#include "llvm/Transforms/Instrumentation/AddressSanitizerOptions.h"
+#include "llvm/Transforms/Instrumentation/AllocToken.h"
#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
@@ -232,6 +234,14 @@ public:
};
} // namespace
+static AllocTokenOptions getAllocTokenOptions(const CodeGenOptions &CGOpts) {
+ AllocTokenOptions Opts;
+ Opts.MaxTokens = CGOpts.AllocTokenMax;
+ Opts.Extended = CGOpts.SanitizeAllocTokenExtended;
+ Opts.FastABI = CGOpts.SanitizeAllocTokenFastABI;
+ return Opts;
+}
+
static SanitizerCoverageOptions
getSancovOptsFromCGOpts(const CodeGenOptions &CGOpts) {
SanitizerCoverageOptions Opts;
@@ -789,6 +799,16 @@ static void addSanitizers(const Triple &TargetTriple,
MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles,
PB.getVirtualFileSystemPtr()));
}
+
+ if (LangOpts.Sanitize.has(SanitizerKind::AllocToken)) {
+ if (Level == OptimizationLevel::O0) {
+ // The default pass builder only infers libcall function attrs when
+ // optimizing, so we insert it here because we need it for accurate
+ // memory allocation function detection.
+ MPM.addPass(InferFunctionAttrsPass());
+ }
+ MPM.addPass(AllocTokenPass(getAllocTokenOptions(CodeGenOpts)));
+ }
};
if (ClSanitizeOnOptimizerEarlyEP) {
PB.registerOptimizerEarlyEPCallback(
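A plausible invocation that exercises this path, using only flags added or wired up in this patch (a usage sketch, not a verified transcript):

    clang++ -O0 -fsanitize=alloc-token -falloc-token-max=1024 -c foo.cpp

At -O0 the extra InferFunctionAttrsPass run gives the AllocToken pass the allocation-function attributes it would otherwise only see when optimizing.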
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index fee6bc0..b91cb36 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -787,7 +787,8 @@ void CGDebugInfo::CreateCompileUnit() {
// Create new compile unit.
TheCU = DBuilder.createCompileUnit(
- LangTag, CUFile, CGOpts.EmitVersionIdentMetadata ? Producer : "",
+ llvm::DISourceLanguageName(LangTag), CUFile,
+ CGOpts.EmitVersionIdentMetadata ? Producer : "",
CGOpts.OptimizationLevel != 0 || CGOpts.PrepareForLTO ||
CGOpts.PrepareForThinLTO,
CGOpts.DwarfDebugFlags, RuntimeVers, CGOpts.SplitDwarfFile, EmissionKind,
@@ -1232,7 +1233,7 @@ llvm::DIType *CGDebugInfo::CreateType(const PointerType *Ty,
/// \return whether a C++ mangling exists for the type defined by TD.
static bool hasCXXMangling(const TagDecl *TD, llvm::DICompileUnit *TheCU) {
- switch (TheCU->getSourceLanguage()) {
+ switch (TheCU->getSourceLanguage().getUnversionedName()) {
case llvm::dwarf::DW_LANG_C_plus_plus:
case llvm::dwarf::DW_LANG_C_plus_plus_11:
case llvm::dwarf::DW_LANG_C_plus_plus_14:
@@ -3211,8 +3212,8 @@ llvm::DIType *CGDebugInfo::CreateType(const ObjCInterfaceType *Ty,
if (!ID)
return nullptr;
- auto RuntimeLang =
- static_cast<llvm::dwarf::SourceLanguage>(TheCU->getSourceLanguage());
+ auto RuntimeLang = static_cast<llvm::dwarf::SourceLanguage>(
+ TheCU->getSourceLanguage().getUnversionedName());
// Return a forward declaration if this type was imported from a clang module,
// and this is not the compile unit with the implementation of the type (which
@@ -3348,7 +3349,8 @@ llvm::DIType *CGDebugInfo::CreateTypeDefinition(const ObjCInterfaceType *Ty,
ObjCInterfaceDecl *ID = Ty->getDecl();
llvm::DIFile *DefUnit = getOrCreateFile(ID->getLocation());
unsigned Line = getLineNumber(ID->getLocation());
- unsigned RuntimeLang = TheCU->getSourceLanguage();
+
+ unsigned RuntimeLang = TheCU->getSourceLanguage().getUnversionedName();
// Bit size, align and offset of the type.
uint64_t Size = CGM.getContext().getTypeSize(Ty);
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 9f30287..a071e80 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1272,6 +1272,23 @@ void CodeGenFunction::EmitBoundsCheckImpl(const Expr *E, llvm::Value *Bound,
EmitCheck(std::make_pair(Check, CheckKind), CheckHandler, StaticData, Index);
}
+void CodeGenFunction::EmitAllocToken(llvm::CallBase *CB, QualType AllocType) {
+ assert(SanOpts.has(SanitizerKind::AllocToken) &&
+ "Only needed with -fsanitize=alloc-token");
+
+ PrintingPolicy Policy(CGM.getContext().getLangOpts());
+ Policy.SuppressTagKeyword = true;
+ Policy.FullyQualifiedName = true;
+ SmallString<64> TypeName;
+ llvm::raw_svector_ostream TypeNameOS(TypeName);
+ AllocType.getCanonicalType().print(TypeNameOS, Policy);
+ auto *TypeMDS = llvm::MDString::get(CGM.getLLVMContext(), TypeNameOS.str());
+
+ // Format: !{<type-name>}
+ auto *MDN = llvm::MDNode::get(CGM.getLLVMContext(), {TypeMDS});
+ CB->setMetadata(llvm::LLVMContext::MD_alloc_token, MDN);
+}
+
CodeGenFunction::ComplexPairTy CodeGenFunction::
EmitComplexPrePostIncDec(const UnaryOperator *E, LValue LV,
bool isInc, bool isPre) {
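A sketch of the metadata EmitAllocToken attaches to an allocation call, assuming an int allocation (the IR shape is inferred from the code above; the metadata slot number is hypothetical):

    %call = call ptr @_Znwm(i64 4), !alloc_token !5
    ...
    !5 = !{!"int"}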
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index c52526c..290c2e0 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1655,11 +1655,16 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) {
RValue RV =
EmitNewDeleteCall(*this, allocator, allocatorType, allocatorArgs);
- // Set !heapallocsite metadata on the call to operator new.
- if (getDebugInfo())
- if (auto *newCall = dyn_cast<llvm::CallBase>(RV.getScalarVal()))
- getDebugInfo()->addHeapAllocSiteMetadata(newCall, allocType,
- E->getExprLoc());
+ if (auto *newCall = dyn_cast<llvm::CallBase>(RV.getScalarVal())) {
+ if (auto *CGDI = getDebugInfo()) {
+ // Set !heapallocsite metadata on the call to operator new.
+ CGDI->addHeapAllocSiteMetadata(newCall, allocType, E->getExprLoc());
+ }
+ if (SanOpts.has(SanitizerKind::AllocToken)) {
+ // Set !alloc_token metadata.
+ EmitAllocToken(newCall, allocType);
+ }
+ }
// If this was a call to a global replaceable allocation function that does
// not take an alignment argument, the allocator is known to produce
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index b2fe917..acf8de4 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -846,6 +846,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
Fn->addFnAttr(llvm::Attribute::SanitizeNumericalStability);
if (SanOpts.hasOneOf(SanitizerKind::Memory | SanitizerKind::KernelMemory))
Fn->addFnAttr(llvm::Attribute::SanitizeMemory);
+ if (SanOpts.has(SanitizerKind::AllocToken))
+ Fn->addFnAttr(llvm::Attribute::SanitizeAllocToken);
}
if (SanOpts.has(SanitizerKind::SafeStack))
Fn->addFnAttr(llvm::Attribute::SafeStack);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 99de6e1..e14e60c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3348,6 +3348,9 @@ public:
SanitizerAnnotateDebugInfo(ArrayRef<SanitizerKind::SanitizerOrdinal> Ordinals,
SanitizerHandler Handler);
+ /// Emit additional metadata used by the AllocToken instrumentation.
+ void EmitAllocToken(llvm::CallBase *CB, QualType AllocType);
+
llvm::Value *GetCountedByFieldExprGEP(const Expr *Base, const FieldDecl *FD,
const FieldDecl *CountDecl);
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 7ce1afe..5dd48f5 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -61,8 +61,9 @@ static const SanitizerMask RecoverableByDefault =
SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
SanitizerKind::FloatDivideByZero | SanitizerKind::ObjCCast |
SanitizerKind::Vptr;
-static const SanitizerMask Unrecoverable =
- SanitizerKind::Unreachable | SanitizerKind::Return;
+static const SanitizerMask Unrecoverable = SanitizerKind::Unreachable |
+ SanitizerKind::Return |
+ SanitizerKind::AllocToken;
static const SanitizerMask AlwaysRecoverable = SanitizerKind::KernelAddress |
SanitizerKind::KernelHWAddress |
SanitizerKind::KCFI;
@@ -84,7 +85,8 @@ static const SanitizerMask CFIClasses =
static const SanitizerMask CompatibleWithMinimalRuntime =
TrappingSupported | SanitizerKind::Scudo | SanitizerKind::ShadowCallStack |
SanitizerKind::MemtagStack | SanitizerKind::MemtagHeap |
- SanitizerKind::MemtagGlobals | SanitizerKind::KCFI;
+ SanitizerKind::MemtagGlobals | SanitizerKind::KCFI |
+ SanitizerKind::AllocToken;
enum CoverageFeature {
CoverageFunc = 1 << 0,
@@ -203,6 +205,7 @@ static void addDefaultIgnorelists(const Driver &D, SanitizerMask Kinds,
{"tysan_blacklist.txt", SanitizerKind::Type},
{"dfsan_abilist.txt", SanitizerKind::DataFlow},
{"cfi_ignorelist.txt", SanitizerKind::CFI},
+ {"alloc_token_ignorelist.txt", SanitizerKind::AllocToken},
{"ubsan_ignorelist.txt",
SanitizerKind::Undefined | SanitizerKind::Vptr |
SanitizerKind::Integer | SanitizerKind::Nullability |
@@ -650,7 +653,12 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
std::make_pair(SanitizerKind::KCFI, SanitizerKind::Function),
std::make_pair(SanitizerKind::Realtime,
SanitizerKind::Address | SanitizerKind::Thread |
- SanitizerKind::Undefined | SanitizerKind::Memory)};
+ SanitizerKind::Undefined | SanitizerKind::Memory),
+ std::make_pair(SanitizerKind::AllocToken,
+ SanitizerKind::Address | SanitizerKind::HWAddress |
+ SanitizerKind::KernelAddress |
+ SanitizerKind::KernelHWAddress |
+ SanitizerKind::Memory)};
// Enable toolchain specific default sanitizers if not explicitly disabled.
SanitizerMask Default = TC.getDefaultSanitizers() & ~AllRemove;
@@ -1159,6 +1167,15 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
!TC.getTriple().isAndroid() && !TC.getTriple().isOSFuchsia();
}
+ if (AllAddedKinds & SanitizerKind::AllocToken) {
+ AllocTokenFastABI = Args.hasFlag(
+ options::OPT_fsanitize_alloc_token_fast_abi,
+ options::OPT_fno_sanitize_alloc_token_fast_abi, AllocTokenFastABI);
+ AllocTokenExtended = Args.hasFlag(
+ options::OPT_fsanitize_alloc_token_extended,
+ options::OPT_fno_sanitize_alloc_token_extended, AllocTokenExtended);
+ }
+
LinkRuntimes = Args.hasFlag(options::OPT_fsanitize_link_runtime,
options::OPT_fno_sanitize_link_runtime,
!Args.hasArg(options::OPT_r));
@@ -1527,6 +1544,12 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
Sanitizers.has(SanitizerKind::Address))
CmdArgs.push_back("-fno-assume-sane-operator-new");
+ // Flags for -fsanitize=alloc-token.
+ if (AllocTokenFastABI)
+ CmdArgs.push_back("-fsanitize-alloc-token-fast-abi");
+ if (AllocTokenExtended)
+ CmdArgs.push_back("-fsanitize-alloc-token-extended");
+
// libFuzzer wants to intercept calls to certain library functions, so the
// following -fno-builtin-* flags force the compiler to emit interposable
// libcalls to these functions. Other sanitizers effectively do the same thing
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index a9041d2..3d5cac6 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1623,7 +1623,8 @@ SanitizerMask ToolChain::getSupportedSanitizers() const {
SanitizerKind::CFICastStrict | SanitizerKind::FloatDivideByZero |
SanitizerKind::KCFI | SanitizerKind::UnsignedIntegerOverflow |
SanitizerKind::UnsignedShiftBase | SanitizerKind::ImplicitConversion |
- SanitizerKind::Nullability | SanitizerKind::LocalBounds;
+ SanitizerKind::Nullability | SanitizerKind::LocalBounds |
+ SanitizerKind::AllocToken;
if (getTriple().getArch() == llvm::Triple::x86 ||
getTriple().getArch() == llvm::Triple::x86_64 ||
getTriple().getArch() == llvm::Triple::arm ||
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 107b9ff..d326a81 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7618,6 +7618,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
// features enabled through -Xclang -target-feature flags.
SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
+ Args.AddLastArg(CmdArgs, options::OPT_falloc_token_max_EQ);
+
#if CLANG_ENABLE_CIR
// Forward -mmlir arguments to the MLIR option parser.
for (const Arg *A : Args.filtered(options::OPT_mmlir)) {
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 49ee53f..16cc1db 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2231,7 +2231,7 @@ static unsigned ParseDebugDefaultVersion(const ToolChain &TC,
return 0;
unsigned Value = 0;
- if (StringRef(A->getValue()).getAsInteger(10, Value) || Value > 5 ||
+ if (StringRef(A->getValue()).getAsInteger(10, Value) || Value > 6 ||
Value < 2)
TC.getDriver().Diag(diag::err_drv_invalid_int_value)
<< A->getAsString(Args) << A->getValue();
@@ -2244,13 +2244,14 @@ unsigned tools::DwarfVersionNum(StringRef ArgValue) {
.Case("-gdwarf-3", 3)
.Case("-gdwarf-4", 4)
.Case("-gdwarf-5", 5)
+ .Case("-gdwarf-6", 6)
.Default(0);
}
const Arg *tools::getDwarfNArg(const ArgList &Args) {
return Args.getLastArg(options::OPT_gdwarf_2, options::OPT_gdwarf_3,
options::OPT_gdwarf_4, options::OPT_gdwarf_5,
- options::OPT_gdwarf);
+ options::OPT_gdwarf_6, options::OPT_gdwarf);
}
unsigned tools::getDwarfVersion(const ToolChain &TC,
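With these changes the driver accepts DWARF 6 alongside the existing versions, via -gdwarf-6 or -fdebug-default-version=6; previously anything above 5 was rejected. A minimal usage sketch:

    clang -gdwarf-6 -c foo.c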
diff --git a/clang/lib/Driver/ToolChains/UEFI.cpp b/clang/lib/Driver/ToolChains/UEFI.cpp
index 75adbf1..d2be147 100644
--- a/clang/lib/Driver/ToolChains/UEFI.cpp
+++ b/clang/lib/Driver/ToolChains/UEFI.cpp
@@ -24,7 +24,9 @@ using namespace clang;
using namespace llvm::opt;
UEFI::UEFI(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
- : ToolChain(D, Triple, Args) {}
+ : ToolChain(D, Triple, Args) {
+ getProgramPaths().push_back(getDriver().Dir);
+}
Tool *UEFI::buildLinker() const { return new tools::uefi::Linker(*this); }
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 50fd50a..292adce 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1833,6 +1833,10 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts,
serializeSanitizerKinds(Opts.SanitizeAnnotateDebugInfo))
GenerateArg(Consumer, OPT_fsanitize_annotate_debug_info_EQ, Sanitizer);
+ if (Opts.AllocTokenMax)
+ GenerateArg(Consumer, OPT_falloc_token_max_EQ,
+ std::to_string(*Opts.AllocTokenMax));
+
if (!Opts.EmitVersionIdentMetadata)
GenerateArg(Consumer, OPT_Qn);
@@ -2346,6 +2350,15 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
}
}
+ if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_max_EQ)) {
+ StringRef S = Arg->getValue();
+ uint64_t Value = 0;
+ if (S.getAsInteger(0, Value))
+ Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S;
+ else
+ Opts.AllocTokenMax = Value;
+ }
+
Opts.EmitVersionIdentMetadata = Args.hasFlag(OPT_Qy, OPT_Qn, true);
if (!LangOpts->CUDAIsDevice)
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 877ab02..b899fb9 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1530,6 +1530,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
Builder.defineMacro("__SANITIZE_HWADDRESS__");
if (LangOpts.Sanitize.has(SanitizerKind::Thread))
Builder.defineMacro("__SANITIZE_THREAD__");
+ if (LangOpts.Sanitize.has(SanitizerKind::AllocToken))
+ Builder.defineMacro("__SANITIZE_ALLOC_TOKEN__");
// Target OS macro definitions.
if (PPOpts.DefineTargetOSMacros) {
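Code can feature-detect the new instrumentation the same way as the neighboring sanitizer macros:

    #ifdef __SANITIZE_ALLOC_TOKEN__
    /* alloc-token instrumentation is enabled for this TU */
    #endif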
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 31759c5..4aaca2d 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -1035,10 +1035,9 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maddubs_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
+ return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
}
/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
@@ -1067,9 +1066,8 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_madd_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
}
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index c36bd81..473fe94 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -1064,12 +1064,12 @@ _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
__m512i __Y) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
@@ -1077,26 +1077,26 @@ _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
(__v32hi)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi)_mm512_maddubs_epi16(__X, __Y),
(__v32hi)_mm512_setzero_si512());
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_madd_epi16(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_madd_epi16(__A, __B),
(__v16si)__W);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_madd_epi16(__A, __B),
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 5e6daa8..81e4cbb9 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -1295,21 +1295,21 @@ _mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
(__v16hi)_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_maddubs_epi16(__X, __Y),
(__v8hi)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
(__v8hi)_mm_maddubs_epi16(__X, __Y),
(__v8hi)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
__m256i __Y) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
@@ -1317,35 +1317,35 @@ _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
(__v16hi)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
(__v16hi)_mm256_maddubs_epi16(__X, __Y),
(__v16hi)_mm256_setzero_si256());
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_madd_epi16(__A, __B),
(__v4si)__W);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_madd_epi16(__A, __B),
(__v4si)_mm_setzero_si128());
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_madd_epi16(__A, __B),
(__v8si)__W);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_madd_epi16(__A, __B),
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 6597e7e..454e9a2 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -2290,8 +2290,8 @@ _mm_avg_epu16(__m128i __a, __m128i __b) {
/// A 128-bit signed [8 x i16] vector.
/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
/// of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
- __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_madd_epi16(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}
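Now that the intrinsic is constexpr, it folds in constant evaluation; a minimal sketch mirroring the values in the sse2-builtins.c test below:

    // Each i32 lane is the sum of products of two adjacent i16 lanes.
    constexpr __m128i r = _mm_madd_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8},
                                         (__m128i)(__v8hi){9, 10, 11, 12, 13, 14, 15, 16});
    // Lanes of r: 29, 81, 149, 233.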
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 5f61753..aca78e6 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -679,11 +679,10 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) {
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
/// products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
-_mm_madd_pi16(__m64 __m1, __m64 __m2)
-{
- return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
- (__v8hi)__anyext128(__m2)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_madd_pi16(__m64 __m1, __m64 __m2) {
+ return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__zext128(__m1),
+ (__v8hi)__zext128(__m2)));
}
/// Multiplies each 16-bit signed integer element of the first 64-bit
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index d40f0c5..3fc9f98 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -23,6 +23,9 @@
#define __trunc64(x) \
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
+#define __zext128(x) \
+ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
+ 1, 2, 3)
#define __anyext128(x) \
(__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \
1, -1, -1)
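A note on the new helper: __anyext128 widens with -1 shuffle indices, leaving the upper 64 bits undefined, whereas __zext128 takes its upper lanes from the zero-initialized second shuffle operand, so the high half is a defined zero. That distinction matters for the constexpr paths below, where undefined elements would block constant evaluation; in either case __trunc64 discards the upper lanes of the result.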
@@ -504,10 +507,9 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddubs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_maddubs_epi16(__m128i __a, __m128i __b) {
+ return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -534,11 +536,10 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
- return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
- (__v16qi)__anyext128(__b)));
+static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_maddubs_pi16(__m64 __a, __m64 __b) {
+ return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__zext128(__a),
+ (__v16qi)__zext128(__b)));
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -796,6 +797,7 @@ _mm_sign_pi32(__m64 __a, __m64 __b)
}
#undef __anyext128
+#undef __zext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 8606227..e9ca8ce 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -2605,6 +2605,17 @@ public:
#endif
}
+ void handleUnsafeUniquePtrArrayAccess(const DynTypedNode &Node,
+ bool IsRelatedToDecl,
+ ASTContext &Ctx) override {
+ SourceLocation Loc = Node.get<Stmt>()->getBeginLoc();
+ S.Diag(Loc, diag::warn_unsafe_buffer_usage_unique_ptr_array_access)
+ << Node.getSourceRange();
+ }
+
bool isSafeBufferOptOut(const SourceLocation &Loc) const override {
return S.PP.isSafeBufferOptOut(S.getSourceManager(), Loc);
}
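A sketch of code the new handler should flag under -Wunsafe-buffer-usage (the exact diagnostic wording lives in this patch's DiagnosticSemaKinds.td change):

    #include <memory>
    void f(std::unique_ptr<int[]> p, int i) {
      p[i] = 0; // warn: unsafe unique_ptr array access
    }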
diff --git a/clang/test/CIR/CodeGen/cast.cpp b/clang/test/CIR/CodeGen/cast.cpp
index 7afa955..844d4df 100644
--- a/clang/test/CIR/CodeGen/cast.cpp
+++ b/clang/test/CIR/CodeGen/cast.cpp
@@ -131,3 +131,36 @@ void bitcast() {
// LLVM: %[[D_VEC:.*]] = load <2 x double>, ptr {{.*}}, align 16
// LLVM: %[[I_VEC:.*]] = bitcast <2 x double> %[[D_VEC]] to <4 x i32>
+
+void f(long int start) {
+ void *p = (void*)start;
+}
+// CIR: %[[L:.*]] = cir.load {{.*}} : !cir.ptr<!s64i>, !s64i
+// CIR: %[[MID:.*]] = cir.cast integral %[[L]] : !s64i -> !u64i
+// CIR: cir.cast int_to_ptr %[[MID]] : !u64i -> !cir.ptr<!void>
+
+// LLVM-LABEL: define{{.*}} void @_Z1fl(i64 %0)
+// LLVM: %[[ADDR:.*]] = alloca i64, i64 1, align 8
+// LLVM: %[[PADDR:.*]] = alloca ptr, i64 1, align 8
+// LLVM: store i64 %0, ptr %[[ADDR]], align 8
+// LLVM: %[[L:.*]] = load i64, ptr %[[ADDR]], align 8
+// LLVM: %[[PTR:.*]] = inttoptr i64 %[[L]] to ptr
+// LLVM: store ptr %[[PTR]], ptr %[[PADDR]], align 8
+// LLVM: ret void
+
+struct A { int x; };
+
+void int_cast(long ptr) {
+ ((A *)ptr)->x = 0;
+}
+// CIR: cir.cast int_to_ptr {{.*}} : !u64i -> !cir.ptr<!rec_A>
+// LLVM: inttoptr {{.*}} to ptr
+
+void null_cast(long) {
+ *(int *)0 = 0;
+ ((A *)0)->x = 0;
+}
+// CIR: %[[NULLPTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i>
+// CIR: cir.store{{.*}} %{{.*}}, %[[NULLPTR]] : !s32i, !cir.ptr<!s32i>
+// CIR: %[[NULLPTR_A:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!rec_A>
+// CIR: %[[A_X:.*]] = cir.get_member %[[NULLPTR_A]][0] {name = "x"} : !cir.ptr<!rec_A> -> !cir.ptr<!s32i>
diff --git a/clang/test/CIR/CodeGen/dtors.cpp b/clang/test/CIR/CodeGen/dtors.cpp
index 66554b7..49952a7 100644
--- a/clang/test/CIR/CodeGen/dtors.cpp
+++ b/clang/test/CIR/CodeGen/dtors.cpp
@@ -171,3 +171,40 @@ bool test_temp_and() { return make_temp(1) && make_temp(2); }
// OGCG: br label %[[CLEANUP_DONE]]
// OGCG: [[CLEANUP_DONE]]:
// OGCG: call void @_ZN1BD2Ev(ptr {{.*}} %[[REF_TMP0]])
+
+struct C {
+ ~C();
+};
+
+struct D {
+ int n;
+ C c;
+ ~D() {}
+};
+
+// CIR: cir.func {{.*}} @_ZN1DD2Ev
+// CIR: %[[C:.*]] = cir.get_member %{{.*}}[1] {name = "c"}
+// CIR: cir.call @_ZN1CD1Ev(%[[C]])
+
+// LLVM: define {{.*}} void @_ZN1DD2Ev
+// LLVM: %[[C:.*]] = getelementptr %struct.D, ptr %{{.*}}, i32 0, i32 1
+// LLVM: call void @_ZN1CD1Ev(ptr %[[C]])
+
+// This destructor is defined after the calling function in OGCG.
+
+void test_nested_dtor() {
+ D d;
+}
+
+// CIR: cir.func{{.*}} @_Z16test_nested_dtorv()
+// CIR: cir.call @_ZN1DD2Ev(%{{.*}})
+
+// LLVM: define {{.*}} void @_Z16test_nested_dtorv()
+// LLVM: call void @_ZN1DD2Ev(ptr %{{.*}})
+
+// OGCG: define {{.*}} void @_Z16test_nested_dtorv()
+// OGCG: call void @_ZN1DD2Ev(ptr {{.*}} %{{.*}})
+
+// OGCG: define {{.*}} void @_ZN1DD2Ev
+// OGCG: %[[C:.*]] = getelementptr inbounds i8, ptr %{{.*}}, i64 4
+// OGCG: call void @_ZN1CD1Ev(ptr {{.*}} %[[C]])
diff --git a/clang/test/CIR/CodeGen/global-init.cpp b/clang/test/CIR/CodeGen/global-init.cpp
index 0c19e68..2afb5a5 100644
--- a/clang/test/CIR/CodeGen/global-init.cpp
+++ b/clang/test/CIR/CodeGen/global-init.cpp
@@ -1,9 +1,10 @@
// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2> %t-before.cir
// RUN: FileCheck --input-file=%t-before.cir %s --check-prefix=CIR-BEFORE-LPP
// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
-
-// Note: The LoweringPrepare work isn't yet complete. We still need to create
-// the global ctor list attribute.
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
struct NeedsCtor {
NeedsCtor();
@@ -15,6 +16,9 @@ NeedsCtor needsCtor;
// CIR-BEFORE-LPP: %[[THIS:.*]] = cir.get_global @needsCtor : !cir.ptr<!rec_NeedsCtor>
// CIR-BEFORE-LPP: cir.call @_ZN9NeedsCtorC1Ev(%[[THIS]]) : (!cir.ptr<!rec_NeedsCtor>) -> ()
+// CIR: module @{{.*}} attributes {
+// CIR-SAME: cir.global_ctors = [#cir.global_ctor<"_GLOBAL__sub_I_[[FILENAME:.*]]", 65535>]
+
// CIR: cir.global external @needsCtor = #cir.zero : !rec_NeedsCtor
// CIR: cir.func internal private @__cxx_global_var_init() {
// CIR: %0 = cir.get_global @needsCtor : !cir.ptr<!rec_NeedsCtor>
@@ -24,3 +28,22 @@ NeedsCtor needsCtor;
// CIR: cir.call @__cxx_global_var_init() : () -> ()
// CIR: cir.return
// CIR: }
+
+// LLVM: @needsCtor = global %struct.NeedsCtor zeroinitializer, align 1
+// LLVM: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_[[FILENAME:.*]], ptr null }]
+// LLVM: declare void @_ZN9NeedsCtorC1Ev(ptr)
+
+// LLVM: define internal void @__cxx_global_var_init()
+// LLVM: call void @_ZN9NeedsCtorC1Ev(ptr @needsCtor)
+
+// LLVM: define void @_GLOBAL__sub_I_[[FILENAME]]()
+// LLVM: call void @__cxx_global_var_init()
+
+// OGCG: @needsCtor = global %struct.NeedsCtor zeroinitializer, align 1
+// OGCG: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_[[FILENAME:.*]], ptr null }]
+
+// OGCG: define internal void @__cxx_global_var_init() {{.*}} section ".text.startup" {
+// OGCG: call void @_ZN9NeedsCtorC1Ev(ptr noundef nonnull align 1 dereferenceable(1) @needsCtor)
+
+// OGCG: define internal void @_GLOBAL__sub_I_[[FILENAME]]() {{.*}} section ".text.startup" {
+// OGCG: call void @__cxx_global_var_init()
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp
index e83e548..74cb567 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp
@@ -21,7 +21,7 @@ void do_things(unsigned A, unsigned B) {
;
T TwoArr[5][5];
-#pragma acc parallel private(TwoArr[B][B])
+#pragma acc parallel private(TwoArr[A:B][A:B])
// CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_i : !cir.ptr<!cir.array<!cir.array<!s32i x 5> x 5>> init {
// CHECK-NEXT: ^bb0(%arg0: !cir.ptr<!cir.array<!cir.array<!s32i x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}):
// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 5> x 5>, !cir.ptr<!cir.array<!cir.array<!s32i x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64}
@@ -30,7 +30,7 @@ void do_things(unsigned A, unsigned B) {
;
#pragma acc parallel private(TwoArr[B][A:B])
;
-#pragma acc parallel private(TwoArr[A:B][A:B])
+#pragma acc parallel private(TwoArr[B][B])
;
#pragma acc parallel private(TwoArr)
// CHECK-NEXT: acc.private.recipe @privatization__ZTSA5_A5_i : !cir.ptr<!cir.array<!cir.array<!s32i x 5> x 5>> init {
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 4299b18..55f18f9 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -810,12 +810,14 @@ __m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
// CHECK: call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
return _mm256_madd_epi16(a, b);
}
+TEST_CONSTEXPR(match_v8si(_mm256_madd_epi16((__m256i)(__v16hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, (__m256i)(__v16hi){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 50, 250, 610, 1130, 1810, 2650, 3650, 4810));
__m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_maddubs_epi16
// CHECK: call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
return _mm256_maddubs_epi16(a, b);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_maddubs_epi16((__m256i)(__v32qi){1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,8}, (__m256i)(__v32qs){2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8}), 5, 18, 39, 68, 15, 42, 77, 120, -3, -14, -33, -60, -15, -42, -77, -120));
__m128i test_mm_maskload_epi32(int const *a, __m128i m) {
// CHECK-LABEL: test_mm_maskload_epi32
diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c
index bd19363..af1c904 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1650,35 +1650,46 @@ __m512i test_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
// CHECK: @llvm.x86.avx512.pmaddubs.w.512
return _mm512_maddubs_epi16(__X,__Y);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_maddubs_epi16((__m512i)(__v64qi){2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, (__m512i)(__v64qs){5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5}), -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5));
+
__m512i test_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, __m512i __Y) {
// CHECK-LABEL: test_mm512_mask_maddubs_epi16
// CHECK: @llvm.x86.avx512.pmaddubs.w.512
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_mask_maddubs_epi16(__W,__U,__X,__Y);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_maddubs_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32}, 0x0000FFFF, (__m512i)(__v64qi){2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, (__m512i)(__v64qs){5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5}), -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32));
+
__m512i test_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
// CHECK-LABEL: test_mm512_maskz_maddubs_epi16
// CHECK: @llvm.x86.avx512.pmaddubs.w.512
// CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
return _mm512_maskz_maddubs_epi16(__U,__X,__Y);
}
+TEST_CONSTEXPR(match_v32hi(_mm512_maskz_maddubs_epi16(0x0000FFFF, (__m512i)(__v64qi){2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, (__m512i)(__v64qs){5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5, 5, -5}), -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, -5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
+
__m512i test_mm512_madd_epi16(__m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_madd_epi16
// CHECK: @llvm.x86.avx512.pmaddw.d.512
return _mm512_madd_epi16(__A,__B);
}
+TEST_CONSTEXPR(match_v16si(_mm512_madd_epi16((__m512i)(__v32hi){1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}, (__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}), 3, 7, 22, 30, 9, 21, 44, 60, 3, 7, 22, 30, 9, 21, 44, 60));
+
__m512i test_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_madd_epi16
// CHECK: @llvm.x86.avx512.pmaddw.d.512
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_mask_madd_epi16(__W,__U,__A,__B);
}
+TEST_CONSTEXPR(match_v16si(_mm512_mask_madd_epi16((__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600}, 0xF0F0, (__m512i)(__v32hi){1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}, (__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}), 100, 200, 300, 400, 9, 21, 44, 60, 900, 1000, 1100, 1200, 9, 21, 44, 60));
+
__m512i test_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_madd_epi16
// CHECK: @llvm.x86.avx512.pmaddw.d.512
// CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
return _mm512_maskz_madd_epi16(__U,__A,__B);
}
+TEST_CONSTEXPR(match_v16si(_mm512_maskz_madd_epi16(0xF0F0, (__m512i)(__v32hi){1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4}, (__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}), 0, 0, 0, 0, 9, 21, 44, 60, 0, 0, 0, 0, 9, 21, 44, 60));
__m256i test_mm512_cvtsepi16_epi8(__m512i __A) {
// CHECK-LABEL: test_mm512_cvtsepi16_epi8
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index 1fe1ec0..c0e46de 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -1865,6 +1865,7 @@ __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m12
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_mask_maddubs_epi16(__W, __U, __X, __Y);
}
+TEST_CONSTEXPR(match_v8hi(_mm_mask_maddubs_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, 0x0F, (__m128i)(__v16qi){1, 1, 2, 2, 3, 3, 4, 4, 1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v16qs){2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -2, -2, -3, -3, -4, -4}), 5, 18, 39, 68, 5, 6, 7, 8));
__m128i test_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
// CHECK-LABEL: test_mm_maskz_maddubs_epi16
@@ -1872,6 +1873,7 @@ __m128i test_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
return _mm_maskz_maddubs_epi16(__U, __X, __Y);
}
+TEST_CONSTEXPR(match_v8hi(_mm_maskz_maddubs_epi16(0x0F, (__m128i)(__v16qi){1, 1, 2, 2, 3, 3, 4, 4, 1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v16qs){2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -2, -2, -3, -3, -4, -4}), 5, 18, 39, 68, 0, 0, 0, 0));
__m256i test_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
// CHECK-LABEL: test_mm256_mask_maddubs_epi16
@@ -1879,6 +1881,7 @@ __m256i test_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, _
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_mask_maddubs_epi16(__W, __U, __X, __Y);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_mask_maddubs_epi16((__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16}, 0x00FF, (__m256i)(__v32qi){1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,8}, (__m256i)(__v32qs){2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8}), 5, 18, 39, 68, 15, 42, 77, 120, -9, -10, -11, -12, -13, -14, -15, -16));
__m256i test_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
// CHECK-LABEL: test_mm256_maskz_maddubs_epi16
@@ -1886,6 +1889,7 @@ __m256i test_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y)
// CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
return _mm256_maskz_maddubs_epi16(__U, __X, __Y);
}
+TEST_CONSTEXPR(match_v16hi(_mm256_maskz_maddubs_epi16(0x00FF, (__m256i)(__v32qi){1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,8}, (__m256i)(__v32qs){2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -2, -2, -3, -3, -4, -4, -5, -5, -6, -6, -7, -7, -8, -8}), 5, 18, 39, 68, 15, 42, 77, 120, 0, 0, 0, 0, 0, 0, 0, 0));
__m128i test_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_madd_epi16
@@ -1893,6 +1897,7 @@ __m128i test_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_mask_madd_epi16(__W, __U, __A, __B);
}
+TEST_CONSTEXPR(match_v4si(_mm_mask_madd_epi16((__m128i)(__v4si){1, 2, 3, 4}, 0x3, (__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){9, 10, 11, 12, 13, 14, 15, 16}), 29, 81, 3, 4));
__m128i test_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_madd_epi16
@@ -1900,6 +1905,7 @@ __m128i test_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
// CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
return _mm_maskz_madd_epi16(__U, __A, __B);
}
+TEST_CONSTEXPR(match_v4si(_mm_maskz_madd_epi16(0x3, (__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){9, 10, 11, 12, 13, 14, 15, 16}), 29, 81, 0, 0));
__m256i test_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_madd_epi16
@@ -1907,6 +1913,7 @@ __m256i test_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m25
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_mask_madd_epi16(__W, __U, __A, __B);
}
+TEST_CONSTEXPR(match_v8si(_mm256_mask_madd_epi16((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}, 0x0F, (__m256i)(__v16hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, (__m256i)(__v16hi){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 50, 250, 610, 1130, 5, 6, 7, 8));
__m256i test_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_maskz_madd_epi16
@@ -1914,6 +1921,7 @@ __m256i test_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
// CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
return _mm256_maskz_madd_epi16(__U, __A, __B);
}
+TEST_CONSTEXPR(match_v8si(_mm256_maskz_madd_epi16(0x0F, (__m256i)(__v16hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, (__m256i)(__v16hi){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 50, 250, 610, 1130, 0, 0, 0, 0));
__m128i test_mm_cvtsepi16_epi8(__m128i __A) {
// CHECK-LABEL: test_mm_cvtsepi16_epi8
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index a4494b69..2b45b92 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -355,12 +355,14 @@ __m64 test_mm_madd_pi16(__m64 a, __m64 b) {
// CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(
return _mm_madd_pi16(a, b);
}
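+// pmaddwd multiplies adjacent signed 16-bit pairs and sums each pair into a
+// 32-bit lane: (1 * -10) + (-2 * 8) = -26 and (3 * 6) + (-4 * -4) = 34.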
+TEST_CONSTEXPR(match_v2si(_mm_madd_pi16((__m64)(__v4hi){+1, -2, +3, -4}, (__m64)(__v4hi){-10, +8, +6, -4}), -26, 34));
__m64 test_mm_maddubs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_maddubs_pi16
// CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(
return _mm_maddubs_pi16(a, b);
}
+TEST_CONSTEXPR(match_v4hi(_mm_maddubs_pi16((__m64)(__v8qi){16, 17, 18, 19, 20, 21, 22, 23}, (__m64)(__v8qi){1, 2, 3, 4, 5, 0, 7, 8}), 50, 130, 100, 338));
void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) {
// CHECK-LABEL: test_mm_maskmove_si64
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 8428fd6..ade7ef3 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -852,6 +852,7 @@ __m128i test_mm_madd_epi16(__m128i A, __m128i B) {
// CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
return _mm_madd_epi16(A, B);
}
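+// Each 32-bit lane is a[2i]*b[2i] + a[2i+1]*b[2i+1], e.g. 1*9 + 2*10 = 29.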
+TEST_CONSTEXPR(match_v4si(_mm_madd_epi16((__m128i)(__v8hi){1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v8hi){9, 10, 11, 12, 13, 14, 15, 16}), 29, 81, 149, 233));
void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
// CHECK-LABEL: test_mm_maskmoveu_si128
diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c
index 56ff73f0..5885768 100644
--- a/clang/test/CodeGen/X86/ssse3-builtins.c
+++ b/clang/test/CodeGen/X86/ssse3-builtins.c
@@ -96,6 +96,7 @@ __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
// CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
return _mm_maddubs_epi16(a, b);
}
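+// pmaddubsw treats the first operand as unsigned bytes and the second as
+// signed bytes; adjacent products are summed with signed saturation into
+// 16-bit lanes.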
+TEST_CONSTEXPR(match_v8hi(_mm_maddubs_epi16((__m128i)(__v16qi){1, 1, 2, 2, 3, 3, 4, 4, 1, 2, 3, 4, 5, 6, 7, 8}, (__m128i)(__v16qs){2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -2, -2, -3, -3, -4, -4}), 5, 18, 39, 68, -3, -14, -33, -60));
__m128i test_mm_mulhrs_epi16(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_mulhrs_epi16
diff --git a/clang/test/CodeGen/alloc-token-ignorelist.c b/clang/test/CodeGen/alloc-token-ignorelist.c
new file mode 100644
index 0000000..954e6e5
--- /dev/null
+++ b/clang/test/CodeGen/alloc-token-ignorelist.c
@@ -0,0 +1,27 @@
+// Test that AllocToken respects the ignorelist for functions and files.
+//
+// RUN: %clang_cc1 -fsanitize=alloc-token -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-ALLOW
+//
+// RUN: echo "fun:excluded_by_all" > %t.func.ignorelist
+// RUN: %clang_cc1 -fsanitize=alloc-token -fsanitize-ignorelist=%t.func.ignorelist -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-FUN
+//
+// RUN: echo "src:%s" | sed -e 's/\\/\\\\/g' > %t.file.ignorelist
+// RUN: %clang_cc1 -fsanitize=alloc-token -fsanitize-ignorelist=%t.file.ignorelist -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SRC
+
+extern void* malloc(unsigned long size);
+
+// CHECK-LABEL: define{{.*}} @excluded_by_all(
+void* excluded_by_all(unsigned long size) {
+ // CHECK-ALLOW: call ptr @__alloc_token_malloc(
+ // CHECK-FUN: call ptr @malloc(
+ // CHECK-SRC: call ptr @malloc(
+ return malloc(size);
+}
+
+// CHECK-LABEL: define{{.*}} @excluded_by_src(
+void* excluded_by_src(unsigned long size) {
+ // CHECK-ALLOW: call ptr @__alloc_token_malloc(
+ // CHECK-FUN: call ptr @__alloc_token_malloc(
+ // CHECK-SRC: call ptr @malloc(
+ return malloc(size);
+}
diff --git a/clang/test/CodeGen/alloc-token-lower.c b/clang/test/CodeGen/alloc-token-lower.c
new file mode 100644
index 0000000..75197bb
--- /dev/null
+++ b/clang/test/CodeGen/alloc-token-lower.c
@@ -0,0 +1,22 @@
+// Test that optimization pipelines do not interfere with AllocToken lowering,
+// and that function attributes are passed on correctly.
+//
+// RUN: %clang_cc1 -fsanitize=alloc-token -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O1 -fsanitize=alloc-token -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O2 -fsanitize=alloc-token -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+typedef __typeof(sizeof(int)) size_t;
+
+void *malloc(size_t size);
+
+// CHECK-LABEL: @test_malloc(
+// CHECK: call{{.*}} ptr @__alloc_token_malloc(i64 noundef 4, i64 0)
+void *test_malloc() {
+ return malloc(sizeof(int));
+}
+
+// CHECK-LABEL: @no_sanitize_malloc(
+// CHECK: call{{.*}} ptr @malloc(i64 noundef 4)
+void *no_sanitize_malloc(size_t size) __attribute__((no_sanitize("alloc-token"))) {
+ return malloc(sizeof(int));
+}
diff --git a/clang/test/CodeGen/alloc-token.c b/clang/test/CodeGen/alloc-token.c
new file mode 100644
index 0000000..d1160ad
--- /dev/null
+++ b/clang/test/CodeGen/alloc-token.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -fsanitize=alloc-token -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+
+typedef __typeof(sizeof(int)) size_t;
+
+void *aligned_alloc(size_t alignment, size_t size);
+void *malloc(size_t size);
+void *calloc(size_t num, size_t size);
+void *realloc(void *ptr, size_t size);
+void *reallocarray(void *ptr, size_t nmemb, size_t size);
+void *memalign(size_t alignment, size_t size);
+void *valloc(size_t size);
+void *pvalloc(size_t size);
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+void *sink;
+
+// CHECK-LABEL: define dso_local void @test_malloc_like(
+// CHECK: call ptr @malloc(i64 noundef 4)
+// CHECK: call ptr @calloc(i64 noundef 3, i64 noundef 4)
+// CHECK: call ptr @realloc(ptr noundef {{.*}}, i64 noundef 8)
+// CHECK: call ptr @reallocarray(ptr noundef {{.*}}, i64 noundef 5, i64 noundef 8)
+// CHECK: call align 128 ptr @aligned_alloc(i64 noundef 128, i64 noundef 1024)
+// CHECK: call align 16 ptr @memalign(i64 noundef 16, i64 noundef 256)
+// CHECK: call ptr @valloc(i64 noundef 4096)
+// CHECK: call ptr @pvalloc(i64 noundef 8192)
+// CHECK: call i32 @posix_memalign(ptr noundef @sink, i64 noundef 64, i64 noundef 4)
+void test_malloc_like() {
+ sink = malloc(sizeof(int));
+ sink = calloc(3, sizeof(int));
+ sink = realloc(sink, sizeof(long));
+ sink = reallocarray(sink, 5, sizeof(long));
+ sink = aligned_alloc(128, 1024);
+ sink = memalign(16, 256);
+ sink = valloc(4096);
+ sink = pvalloc(8192);
+ posix_memalign(&sink, 64, sizeof(int));
+}
diff --git a/clang/test/CodeGen/dwarf-version.c b/clang/test/CodeGen/dwarf-version.c
index 258c258..500f66c 100644
--- a/clang/test/CodeGen/dwarf-version.c
+++ b/clang/test/CodeGen/dwarf-version.c
@@ -2,6 +2,7 @@
// RUN: %clang -target x86_64-linux-gnu -gdwarf-3 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER3
// RUN: %clang -target x86_64-linux-gnu -gdwarf-4 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER4
// RUN: %clang -target x86_64-linux-gnu -gdwarf-5 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER5
+// RUN: %clang -target x86_64-linux-gnu -gdwarf-6 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER6
// RUN: %clang -target x86_64-linux-gnu -g -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER5
// RUN: %clang -target x86_64-linux-gnu -gdwarf -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER5
// RUN: %clang --target=i386-pc-solaris -g -S -emit-llvm -o - %s | FileCheck %s --check-prefix=VER5
@@ -61,6 +62,7 @@ int main (void) {
// VER3: !{i32 7, !"Dwarf Version", i32 3}
// VER4: !{i32 7, !"Dwarf Version", i32 4}
// VER5: !{i32 7, !"Dwarf Version", i32 5}
+// VER6: !{i32 7, !"Dwarf Version", i32 6}
// UNSUPPORTED-VER5: error: unsupported option '-gdwarf-5'
// NODWARF-NOT: !"Dwarf Version"
diff --git a/clang/test/CodeGenCXX/alloc-token.cpp b/clang/test/CodeGenCXX/alloc-token.cpp
new file mode 100644
index 0000000..52bad9c
--- /dev/null
+++ b/clang/test/CodeGenCXX/alloc-token.cpp
@@ -0,0 +1,141 @@
+// RUN: %clang_cc1 -fsanitize=alloc-token -triple x86_64-linux-gnu -std=c++20 -fexceptions -fcxx-exceptions -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s
+
+#include "../Analysis/Inputs/system-header-simulator-cxx.h"
+extern "C" {
+void *aligned_alloc(size_t alignment, size_t size);
+void *malloc(size_t size);
+void *calloc(size_t num, size_t size);
+void *realloc(void *ptr, size_t size);
+void *reallocarray(void *ptr, size_t nmemb, size_t size);
+void *memalign(size_t alignment, size_t size);
+void *valloc(size_t size);
+void *pvalloc(size_t size);
+int posix_memalign(void **memptr, size_t alignment, size_t size);
+
+struct __sized_ptr_t {
+ void *p;
+ size_t n;
+};
+enum class __hot_cold_t : uint8_t;
+__sized_ptr_t __size_returning_new(size_t size);
+__sized_ptr_t __size_returning_new_hot_cold(size_t, __hot_cold_t);
+__sized_ptr_t __size_returning_new_aligned(size_t, std::align_val_t);
+__sized_ptr_t __size_returning_new_aligned_hot_cold(size_t, std::align_val_t, __hot_cold_t);
+}
+
+void *sink; // prevent optimizations from removing the calls
+
+// CHECK-LABEL: define dso_local void @_Z16test_malloc_likev(
+// CHECK: call ptr @malloc(i64 noundef 4)
+// CHECK: call ptr @calloc(i64 noundef 3, i64 noundef 4)
+// CHECK: call ptr @realloc(ptr noundef {{.*}}, i64 noundef 8)
+// CHECK: call ptr @reallocarray(ptr noundef {{.*}}, i64 noundef 5, i64 noundef 8)
+// CHECK: call align 128 ptr @aligned_alloc(i64 noundef 128, i64 noundef 1024)
+// CHECK: call ptr @memalign(i64 noundef 16, i64 noundef 256)
+// CHECK: call ptr @valloc(i64 noundef 4096)
+// CHECK: call ptr @pvalloc(i64 noundef 8192)
+// CHECK: call i32 @posix_memalign(ptr noundef @sink, i64 noundef 64, i64 noundef 4)
+void test_malloc_like() {
+ sink = malloc(sizeof(int));
+ sink = calloc(3, sizeof(int));
+ sink = realloc(sink, sizeof(long));
+ sink = reallocarray(sink, 5, sizeof(long));
+ sink = aligned_alloc(128, 1024);
+ sink = memalign(16, 256);
+ sink = valloc(4096);
+ sink = pvalloc(8192);
+ posix_memalign(&sink, 64, sizeof(int));
+}
+
+// CHECK-LABEL: define dso_local void @_Z17test_operator_newv(
+// CHECK: call noalias noundef nonnull ptr @_Znwm(i64 noundef 4)
+// CHECK: call noalias noundef nonnull ptr @_Znwm(i64 noundef 4)
+void test_operator_new() {
+ sink = __builtin_operator_new(sizeof(int));
+ sink = ::operator new(sizeof(int));
+}
+
+// CHECK-LABEL: define dso_local void @_Z25test_operator_new_nothrowv(
+// CHECK: call noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef 4, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+// CHECK: call noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef 4, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow)
+void test_operator_new_nothrow() {
+ sink = __builtin_operator_new(sizeof(int), std::nothrow);
+ sink = ::operator new(sizeof(int), std::nothrow);
+}
+
+// CHECK-LABEL: define dso_local noundef ptr @_Z8test_newv(
+// CHECK: call noalias noundef nonnull ptr @_Znwm(i64 noundef 4){{.*}} !alloc_token [[META_INT:![0-9]+]]
+int *test_new() {
+ return new int;
+}
+
+// CHECK-LABEL: define dso_local noundef ptr @_Z14test_new_arrayv(
+// CHECK: call noalias noundef nonnull ptr @_Znam(i64 noundef 40){{.*}} !alloc_token [[META_INT]]
+int *test_new_array() {
+ return new int[10];
+}
+
+// CHECK-LABEL: define dso_local noundef ptr @_Z16test_new_nothrowv(
+// CHECK: call noalias noundef ptr @_ZnwmRKSt9nothrow_t(i64 noundef 4, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow){{.*}} !alloc_token [[META_INT]]
+int *test_new_nothrow() {
+ return new (std::nothrow) int;
+}
+
+// CHECK-LABEL: define dso_local noundef ptr @_Z22test_new_array_nothrowv(
+// CHECK: call noalias noundef ptr @_ZnamRKSt9nothrow_t(i64 noundef 40, ptr noundef nonnull align 1 dereferenceable(1) @_ZSt7nothrow){{.*}} !alloc_token [[META_INT]]
+int *test_new_array_nothrow() {
+ return new (std::nothrow) int[10];
+}
+
+// CHECK-LABEL: define dso_local void @_Z23test_size_returning_newv(
+// CHECK: call { ptr, i64 } @__size_returning_new(i64 noundef 8)
+// CHECK: call { ptr, i64 } @__size_returning_new_hot_cold(i64 noundef 8, i8 noundef zeroext 1)
+// CHECK: call { ptr, i64 } @__size_returning_new_aligned(i64 noundef 8, i64 noundef 32)
+// CHECK: call { ptr, i64 } @__size_returning_new_aligned_hot_cold(i64 noundef 8, i64 noundef 32, i8 noundef zeroext 1)
+void test_size_returning_new() {
+ sink = __size_returning_new(sizeof(long)).p;
+ sink = __size_returning_new_hot_cold(sizeof(long), __hot_cold_t{1}).p;
+ sink = __size_returning_new_aligned(sizeof(long), std::align_val_t{32}).p;
+ sink = __size_returning_new_aligned_hot_cold(sizeof(long), std::align_val_t{32}, __hot_cold_t{1}).p;
+}
+
+class TestClass {
+public:
+ virtual void Foo();
+ virtual ~TestClass();
+ int data[16];
+};
+
+void may_throw();
+
+// CHECK-LABEL: define dso_local noundef ptr @_Z27test_exception_handling_newv(
+// CHECK: invoke noalias noundef nonnull ptr @_Znwm(i64 noundef 72)
+// CHECK-NEXT: !alloc_token [[META_TESTCLASS:![0-9]+]]
+TestClass *test_exception_handling_new() {
+ try {
+ TestClass *obj = new TestClass();
+ may_throw();
+ return obj;
+ } catch (...) {
+ return nullptr;
+ }
+}
+
+// CHECK-LABEL: define dso_local noundef ptr @_Z14test_new_classv(
+// CHECK: call noalias noundef nonnull ptr @_Znwm(i64 noundef 72){{.*}} !alloc_token [[META_TESTCLASS]]
+TestClass *test_new_class() {
+ TestClass *obj = new TestClass();
+ obj->data[0] = 42;
+ return obj;
+}
+
+// CHECK-LABEL: define dso_local noundef ptr @_Z20test_new_class_arrayv(
+// CHECK: call noalias noundef nonnull ptr @_Znam(i64 noundef 728){{.*}} !alloc_token [[META_TESTCLASS]]
+TestClass *test_new_class_array() {
+ TestClass* arr = new TestClass[10];
+ arr[0].data[0] = 123;
+ return arr;
+}
+
+// CHECK: [[META_INT]] = !{!"int"}
+// CHECK: [[META_TESTCLASS]] = !{!"TestClass"}
diff --git a/clang/test/Driver/fsanitize-alloc-token.c b/clang/test/Driver/fsanitize-alloc-token.c
new file mode 100644
index 0000000..2964f60
--- /dev/null
+++ b/clang/test/Driver/fsanitize-alloc-token.c
@@ -0,0 +1,43 @@
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alloc-token %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TOKEN-ALLOC
+// CHECK-TOKEN-ALLOC: "-fsanitize=alloc-token"
+
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -fno-sanitize=alloc-token %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TOKEN-ALLOC
+// CHECK-NO-TOKEN-ALLOC-NOT: "-fsanitize=alloc-token"
+
+// RUN: %clang --target=x86_64-linux-gnu -flto -fvisibility=hidden -fno-sanitize-ignorelist -fsanitize=alloc-token,undefined,cfi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COMPATIBLE
+// CHECK-COMPATIBLE: "-fsanitize={{.*}}alloc-token"
+
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -fsanitize-minimal-runtime %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MINIMAL
+// CHECK-MINIMAL: "-fsanitize=alloc-token"
+// CHECK-MINIMAL: "-fsanitize-minimal-runtime"
+
+// RUN: %clang --target=arm-arm-non-eabi -fsanitize=alloc-token %s -### 2>&1 | FileCheck %s -check-prefix=CHECK-BAREMETAL
+// RUN: %clang --target=aarch64-none-elf -fsanitize=alloc-token %s -### 2>&1 | FileCheck %s -check-prefix=CHECK-BAREMETAL
+// CHECK-BAREMETAL: "-fsanitize=alloc-token"
+
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=alloc-token,address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INCOMPATIBLE-ADDRESS
+// CHECK-INCOMPATIBLE-ADDRESS: error: invalid argument '-fsanitize=alloc-token' not allowed with '-fsanitize=address'
+
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=alloc-token,memory %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INCOMPATIBLE-MEMORY
+// CHECK-INCOMPATIBLE-MEMORY: error: invalid argument '-fsanitize=alloc-token' not allowed with '-fsanitize=memory'
+
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -fsanitize-trap=alloc-token %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INCOMPATIBLE-TRAP
+// CHECK-INCOMPATIBLE-TRAP: error: unsupported argument 'alloc-token' to option '-fsanitize-trap='
+
+// RUN: not %clang --target=x86_64-linux-gnu %s -fsanitize=alloc-token -fsanitize-recover=alloc-token -### 2>&1 | FileCheck %s --check-prefix=CHECK-INCOMPATIBLE-RECOVER
+// CHECK-INCOMPATIBLE-RECOVER: unsupported argument 'alloc-token' to option '-fsanitize-recover='
+
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -fsanitize-alloc-token-fast-abi %s -### 2>&1 | FileCheck -check-prefix=CHECK-FASTABI %s
+// CHECK-FASTABI: "-fsanitize-alloc-token-fast-abi"
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -fsanitize-alloc-token-fast-abi -fno-sanitize-alloc-token-fast-abi %s -### 2>&1 | FileCheck -check-prefix=CHECK-NOFASTABI %s
+// CHECK-NOFASTABI-NOT: "-fsanitize-alloc-token-fast-abi"
+
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -fsanitize-alloc-token-extended %s -### 2>&1 | FileCheck -check-prefix=CHECK-EXTENDED %s
+// CHECK-EXTENDED: "-fsanitize-alloc-token-extended"
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -fsanitize-alloc-token-extended -fno-sanitize-alloc-token-extended %s -### 2>&1 | FileCheck -check-prefix=CHECK-NOEXTENDED %s
+// CHECK-NOEXTENDED-NOT: "-fsanitize-alloc-token-extended"
+
+// RUN: %clang --target=x86_64-linux-gnu -falloc-token-max=0 -falloc-token-max=42 %s -### 2>&1 | FileCheck -check-prefix=CHECK-MAX %s
+// CHECK-MAX: "-falloc-token-max=42"
+// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -falloc-token-max=-1 %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-MAX %s
+// CHECK-INVALID-MAX: error: invalid value
diff --git a/clang/test/Driver/fuse-ld.c b/clang/test/Driver/fuse-ld.c
index f807434..cdcd512 100644
--- a/clang/test/Driver/fuse-ld.c
+++ b/clang/test/Driver/fuse-ld.c
@@ -101,3 +101,8 @@
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-BFD
// CHECK-WINDOWS-MSVC-BFD: "{{.*}}ld.bfd"
// CHECK-WINDOWS-MSVC-BFD-SAME: "-o"
+
+// RUN: %clang %s -### -fuse-ld=lld \
+// RUN: --target=x86_64-unknown-uefi 2>&1 \
+// RUN: | FileCheck %s --check-prefix CHECK-UEFI-LLD-LINK
+// CHECK-UEFI-LLD-LINK: "{{.*}}lld-link
diff --git a/clang/test/Driver/uefi-constructed-args.c b/clang/test/Driver/uefi-constructed-args.c
index c06cce3..b06920f 100644
--- a/clang/test/Driver/uefi-constructed-args.c
+++ b/clang/test/Driver/uefi-constructed-args.c
@@ -12,3 +12,8 @@
// CHECK-SAME: "/entry:EfiMain"
// CHECK-SAME: "/tsaware:no"
// CHECK-SAME: "/debug"
+
+// RUN: %clang -### --target=x86_64-unknown-uefi -print-search-dirs 2>&1 \
+// RUN: | FileCheck -check-prefixes=PROGPATH %s
+// PROGPATH: InstalledDir: [[DRIVER_INSTALLED_DIR:.*]]
+// PROGPATH: programs: =[[DRIVER_INSTALLED_DIR]]
diff --git a/clang/test/Preprocessor/alloc_token.cpp b/clang/test/Preprocessor/alloc_token.cpp
new file mode 100644
index 0000000..0c51bfb
--- /dev/null
+++ b/clang/test/Preprocessor/alloc_token.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -E -fsanitize=alloc-token %s -o - | FileCheck --check-prefix=CHECK-SANITIZE %s
+// RUN: %clang_cc1 -E %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+
+#if __SANITIZE_ALLOC_TOKEN__
+// CHECK-SANITIZE: has_sanitize_alloc_token
+int has_sanitize_alloc_token();
+#else
+// CHECK-DEFAULT: no_sanitize_alloc_token
+int no_sanitize_alloc_token();
+#endif
diff --git a/clang/test/Preprocessor/print-header-json.c b/clang/test/Preprocessor/print-header-json.c
index 057dcc2..e0533cd 100644
--- a/clang/test/Preprocessor/print-header-json.c
+++ b/clang/test/Preprocessor/print-header-json.c
@@ -22,6 +22,7 @@
#include "system2.h"
// RUN: rm %t.txt
+// RUN: rm -rf %t
// RUN: env CC_PRINT_HEADERS_FORMAT=json CC_PRINT_HEADERS_FILTERING=direct-per-file CC_PRINT_HEADERS_FILE=%t.txt %clang -fsyntax-only -I %S/Inputs/print-header-json -isystem %S/Inputs/print-header-json/system -fmodules -fimplicit-module-maps -fmodules-cache-path=%t %s -o /dev/null
// RUN: cat %t.txt | FileCheck %s --check-prefix=SUPPORTED_PERFILE_MODULES
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-unique-ptr.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-unique-ptr.cpp
new file mode 100644
index 0000000..789d8a2
--- /dev/null
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-unique-ptr.cpp
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -Wno-unused-value -Wunsafe-buffer-usage -fsafe-buffer-usage-suggestions -std=c++20 -verify=expected %s
+
+namespace std {
+inline namespace __1 {
+template <class T> class unique_ptr {
+public:
+ T &operator[](long long i) const;
+};
+} // namespace __1
+} // namespace std
+
+int get_index() {
+ return 4;
+}
+
+void basic_unique_ptr() {
+ std::unique_ptr<int[]> p1;
+ int i = 2;
+ const int j = 3;
+ int k = 0;
+
+ p1[0]; // This is allowed
+
+ p1[k]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+ p1[1]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+ p1[1L]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+ p1[1LL]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+ p1[3 * 5]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+ p1[i]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+ p1[j]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+ p1[i + 5]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+ p1[get_index()]; // expected-warning{{direct access using operator[] on std::unique_ptr<T[]> is unsafe due to lack of bounds checking}}
+
+}
+
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index d41b5cc..380f664 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -320,6 +320,57 @@ conformance.</p>
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3460.pdf">N3460</a></td>
<td class="full" align="center">Clang 12</td>
</tr>
+ <!-- Brno Aug 2025 Papers -->
+ <tr>
+ <td>Matching of Multi-Dimensional Arrays in Generic Selection Expressions</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3348.pdf">N3348</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>The __COUNTER__ predefined macro</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3457.htm">N3457</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Chasing Ghosts I: constant expressions v2</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3558.htm">N3558</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Earthly Demon XV: Definition of Main</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3623.pdf">N3623</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>static_assert without UB</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3525.htm">N3525</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Allow calling static inline within extern inline</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3622.txt">N3622</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Generic replacement (v. 2 of quasi-literals)</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3605.pdf">N3605</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Member access of an incomplete object</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3532.pdf">N3532</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Representation of Pointers and nullptr_t</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3563.pdf">N3563</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Classification of the register storage-class specifier</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3544.txt">N3544</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
</table>
</details>
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c
index be29c90..06bc7c4a 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64.c
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c
@@ -32,19 +32,21 @@ typedef struct __ifunc_arg_t {
_Bool __aarch64_have_lse_atomics
__attribute__((visibility("hidden"), nocommon)) = false;
+// The formatter wants to re-order these includes, but doing so is incorrect:
+// clang-format off
#if defined(__FreeBSD__) || defined(__OpenBSD__)
-// clang-format off: should not reorder sys/auxv.h alphabetically
#include <sys/auxv.h>
-// clang-format on
#include "aarch64/hwcap.inc"
#include "aarch64/lse_atomics/elf_aux_info.inc"
#elif defined(__Fuchsia__)
#include "aarch64/hwcap.inc"
#include "aarch64/lse_atomics/fuchsia.inc"
#elif defined(__ANDROID__)
+#include <sys/auxv.h>
#include "aarch64/hwcap.inc"
#include "aarch64/lse_atomics/android.inc"
-#elif defined(__linux__) && __has_include(<sys/auxv.h>)
+#elif defined(__linux__)
+#include <sys/auxv.h>
#include "aarch64/hwcap.inc"
#include "aarch64/lse_atomics/getauxval.inc"
#elif defined(_WIN32)
@@ -52,6 +54,7 @@ _Bool __aarch64_have_lse_atomics
#else
// When unimplemented, we leave __aarch64_have_lse_atomics initialized to false.
#endif
+// clang-format on
#if !defined(DISABLE_AARCH64_FMV)
@@ -74,7 +77,7 @@ struct {
#elif defined(__ANDROID__)
#include "aarch64/fmv/hwcap.inc"
#include "aarch64/fmv/android.inc"
-#elif defined(__linux__) && __has_include(<sys/auxv.h>)
+#elif defined(__linux__)
#include "aarch64/fmv/hwcap.inc"
#include "aarch64/fmv/getauxval.inc"
#elif defined(_WIN32)
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc
index 159c617..1c53191 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc
@@ -1,8 +1,3 @@
-#if __has_include(<sys/hwcap.h>)
-#include <sys/hwcap.h>
-#define HAVE_SYS_HWCAP_H
-#endif
-
#ifndef _IFUNC_ARG_HWCAP
#define _IFUNC_ARG_HWCAP (1ULL << 62)
#endif
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc
index 94bf64a..8d8a913 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/android.inc
@@ -1,5 +1,4 @@
#include <string.h>
-#include <sys/auxv.h>
#include <sys/system_properties.h>
static bool __isExynos9810(void) {
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc
index 6642c1f..670aba5 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/getauxval.inc
@@ -1,5 +1,3 @@
-#include <sys/auxv.h>
-
static void CONSTRUCTOR_ATTRIBUTE init_have_lse_atomics(void) {
unsigned long hwcap = getauxval(AT_HWCAP);
__aarch64_have_lse_atomics = (hwcap & HWCAP_ATOMICS) != 0;
diff --git a/flang/test/Examples/omp-atomic.f90 b/flang/test/Examples/omp-atomic.f90
index 5695b62..d7e0a1c 100644
--- a/flang/test/Examples/omp-atomic.f90
+++ b/flang/test/Examples/omp-atomic.f90
@@ -1,6 +1,6 @@
! REQUIRES: plugins, examples, shell
-! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport.so -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport%pluginext -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
! Check OpenMP 2.13.6 atomic Construct
diff --git a/flang/test/Examples/omp-declarative-directive.f90 b/flang/test/Examples/omp-declarative-directive.f90
index 4a9ad91..6e1b0be 100644
--- a/flang/test/Examples/omp-declarative-directive.f90
+++ b/flang/test/Examples/omp-declarative-directive.f90
@@ -1,6 +1,6 @@
! REQUIRES: plugins, examples, shell
-! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport.so -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport%pluginext -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
! Check OpenMP declarative directives
diff --git a/flang/test/Examples/omp-device-constructs.f90 b/flang/test/Examples/omp-device-constructs.f90
index 916f7c9..ae52f73 100644
--- a/flang/test/Examples/omp-device-constructs.f90
+++ b/flang/test/Examples/omp-device-constructs.f90
@@ -1,6 +1,6 @@
! REQUIRES: plugins, examples, shell
-!RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport.so -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
+!RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport%pluginext -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
! Check OpenMP clause validity for the following directives:
! 2.10 Device constructs
diff --git a/flang/test/Examples/omp-in-reduction-clause.f90 b/flang/test/Examples/omp-in-reduction-clause.f90
index fc3fff5..ced6722 100644
--- a/flang/test/Examples/omp-in-reduction-clause.f90
+++ b/flang/test/Examples/omp-in-reduction-clause.f90
@@ -1,6 +1,6 @@
! REQUIRES: plugins, examples, shell
-! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport.so -plugin flang-omp-report -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
+! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport%pluginext -plugin flang-omp-report -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
! Check for IN_REDUCTION() clause on OpenMP constructs
diff --git a/flang/test/Examples/omp-nowait.f90 b/flang/test/Examples/omp-nowait.f90
index 091a952..1d8f9e0 100644
--- a/flang/test/Examples/omp-nowait.f90
+++ b/flang/test/Examples/omp-nowait.f90
@@ -1,6 +1,6 @@
! REQUIRES: plugins, examples, shell
-! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport.so -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport%pluginext -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
subroutine sb(n)
implicit none
diff --git a/flang/test/Examples/omp-order-clause.f90 b/flang/test/Examples/omp-order-clause.f90
index 8d1c3f4..976c750 100644
--- a/flang/test/Examples/omp-order-clause.f90
+++ b/flang/test/Examples/omp-order-clause.f90
@@ -1,6 +1,6 @@
! REQUIRES: plugins, examples, shell
-! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport.so -plugin flang-omp-report -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
+! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport%pluginext -plugin flang-omp-report -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
! Check for ORDER([order-modifier :]concurrent) clause on OpenMP constructs
diff --git a/flang/test/Examples/omp-sections.f90 b/flang/test/Examples/omp-sections.f90
index a6d2806..96de363 100644
--- a/flang/test/Examples/omp-sections.f90
+++ b/flang/test/Examples/omp-sections.f90
@@ -1,6 +1,6 @@
! REQUIRES: plugins, examples, shell
-! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport.so -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -load %llvmshlibdir/flangOmpReport%pluginext -plugin flang-omp-report -fopenmp %s -o - | FileCheck %s
subroutine omp_sections()
integer :: x
diff --git a/libc/config/linux/app.h b/libc/config/linux/app.h
index f3d11da..11ac6ee 100644
--- a/libc/config/linux/app.h
+++ b/libc/config/linux/app.h
@@ -35,17 +35,6 @@ struct TLSImage {
uintptr_t align;
};
-// Linux manpage on `proc(5)` says that the aux vector is an array of
-// unsigned long pairs.
-// (see: https://man7.org/linux/man-pages/man5/proc.5.html)
-using AuxEntryType = unsigned long;
-// Using the naming convention from `proc(5)`.
-// TODO: Would be nice to use the aux entry structure from elf.h when available.
-struct AuxEntry {
- AuxEntryType id;
- AuxEntryType value;
-};
-
struct Args {
uintptr_t argc;
@@ -70,9 +59,6 @@ struct AppProperties {
// Environment data.
uintptr_t *env_ptr;
-
- // Auxiliary vector data.
- AuxEntry *auxv_ptr;
};
[[gnu::weak]] extern AppProperties app;
diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt
index 70da983..5f506c4 100644
--- a/libc/include/llvm-libc-types/CMakeLists.txt
+++ b/libc/include/llvm-libc-types/CMakeLists.txt
@@ -39,6 +39,8 @@ add_header(float_t HDR float_t.h)
add_header(gid_t HDR gid_t.h)
add_header(uid_t HDR uid_t.h)
add_header(imaxdiv_t HDR imaxdiv_t.h)
+add_header(in_addr_t HDR in_addr_t.h)
+add_header(in_addr HDR in_addr.h DEPENDS .in_addr_t)
add_header(ino_t HDR ino_t.h)
add_header(mbstate_t HDR mbstate_t.h)
add_header(mode_t HDR mode_t.h)
diff --git a/libc/include/llvm-libc-types/in_addr.h b/libc/include/llvm-libc-types/in_addr.h
new file mode 100644
index 0000000..ab42142
--- /dev/null
+++ b/libc/include/llvm-libc-types/in_addr.h
@@ -0,0 +1,18 @@
+//===-- Definition of in_addr type ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_IN_ADDR_H
+#define LLVM_LIBC_TYPES_IN_ADDR_H
+
+#include "in_addr_t.h"
+
+typedef struct {
+ in_addr_t s_addr;
+} in_addr;
+
+#endif // LLVM_LIBC_TYPES_IN_ADDR_H
diff --git a/libc/include/llvm-libc-types/in_addr_t.h b/libc/include/llvm-libc-types/in_addr_t.h
new file mode 100644
index 0000000..334b073
--- /dev/null
+++ b/libc/include/llvm-libc-types/in_addr_t.h
@@ -0,0 +1,14 @@
+//===-- Definition of in_addr_t type --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TYPES_IN_ADDR_T_H
+#define LLVM_LIBC_TYPES_IN_ADDR_T_H
+
+typedef __UINT32_TYPE__ in_addr_t;
+
+#endif // LLVM_LIBC_TYPES_IN_ADDR_T_H
diff --git a/libc/src/__support/File/linux/lseekImpl.h b/libc/src/__support/File/linux/lseekImpl.h
index c22a6c5..47df99a 100644
--- a/libc/src/__support/File/linux/lseekImpl.h
+++ b/libc/src/__support/File/linux/lseekImpl.h
@@ -25,8 +25,9 @@ namespace internal {
LIBC_INLINE ErrorOr<off_t> lseekimpl(int fd, off_t offset, int whence) {
off_t result;
#ifdef SYS_lseek
- int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_lseek, fd, offset, whence);
- result = ret;
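+  // A 64-bit return type avoids truncating large offsets; negative values
+  // encode -errno from the raw syscall.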
+ result = LIBC_NAMESPACE::syscall_impl<off_t>(SYS_lseek, fd, offset, whence);
+ if (result < 0)
+ return Error(-static_cast<int>(result));
#elif defined(SYS_llseek) || defined(SYS__llseek)
static_assert(sizeof(size_t) == 4, "size_t must be 32 bits.");
#ifdef SYS_llseek
@@ -37,11 +38,11 @@ LIBC_INLINE ErrorOr<off_t> lseekimpl(int fd, off_t offset, int whence) {
off_t offset_64 = offset;
int ret = LIBC_NAMESPACE::syscall_impl<int>(
LLSEEK_SYSCALL_NO, fd, offset_64 >> 32, offset_64, &result, whence);
+ if (ret < 0)
+ return Error(-ret);
#else
#error "lseek, llseek and _llseek syscalls not available."
#endif
- if (ret < 0)
- return Error(-ret);
return result;
}
diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt
index f303e54..f6377ca 100644
--- a/libc/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt
@@ -25,6 +25,18 @@ add_object_library(
)
add_header_library(
+ auxv
+ HDRS
+ auxv.h
+ DEPENDS
+ libc.hdr.fcntl_macros
+ libc.src.__support.OSUtil.osutil
+ libc.src.__support.common
+ libc.src.__support.CPP.optional
+ libc.src.__support.threads.callonce
+)
+
+add_header_library(
getrandom
HDRS
getrandom.h
diff --git a/libc/src/__support/OSUtil/linux/auxv.h b/libc/src/__support/OSUtil/linux/auxv.h
new file mode 100644
index 0000000..894868a
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/auxv.h
@@ -0,0 +1,158 @@
+//===------------- Linux AUXV Header --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AUXV_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AUXV_H
+
+#include "hdr/fcntl_macros.h" // For open flags
+#include "src/__support/OSUtil/syscall.h"
+#include "src/__support/common.h"
+#include "src/__support/threads/callonce.h"
+
+#include <linux/auxvec.h> // For AT_ macros
+#include <linux/mman.h> // For mmap flags
+#include <linux/prctl.h> // For prctl
+#include <sys/syscall.h> // For syscall numbers
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace auxv {
+struct Entry {
+ unsigned long type; // Entry type
+ unsigned long val; // Integer value
+};
+
+class Vector {
+ LIBC_INLINE_VAR static constexpr Entry END = {AT_NULL, AT_NULL};
+ LIBC_INLINE_VAR static const Entry *entries = &END;
+ LIBC_INLINE_VAR static CallOnceFlag init_flag = callonce_impl::NOT_CALLED;
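+  // 64 entries should comfortably exceed the number of auxv entries current
+  // Linux kernels pass on the initial stack.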
+ LIBC_INLINE_VAR constexpr static size_t FALLBACK_AUXV_ENTRIES = 64;
+
+ LIBC_INLINE static void fallback_initialize_unsync();
+ LIBC_INLINE static const Entry *get_entries() {
+ if (LIBC_LIKELY(entries != &END))
+ return entries;
+ callonce(&init_flag, fallback_initialize_unsync);
+ return entries;
+ }
+
+public:
+ class Iterator {
+ const Entry *current;
+
+ public:
+ LIBC_INLINE explicit Iterator(const Entry *entry) : current(entry) {}
+ LIBC_INLINE Iterator &operator++() {
+ ++current;
+ return *this;
+ }
+ LIBC_INLINE const Entry &operator*() const { return *current; }
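+    // Iterators compare by entry type, so any AT_NULL entry equals end().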
+ LIBC_INLINE bool operator!=(const Iterator &other) const {
+ return current->type != other.current->type;
+ }
+ LIBC_INLINE bool operator==(const Iterator &other) const {
+ return current->type == other.current->type;
+ }
+ };
+ using iterator = Iterator;
+ LIBC_INLINE static Iterator begin() { return Iterator(get_entries()); }
+ LIBC_INLINE static Iterator end() { return Iterator(&END); }
+ LIBC_INLINE static void initialize_unsafe(const Entry *auxv);
+};
+
+// Initializes the auxv entries.
+// This function is intended to be called once inside crt0.
+LIBC_INLINE void Vector::initialize_unsafe(const Entry *auxv) {
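+  // Runs inside crt0 before any threads exist, so plain stores are enough.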
+ init_flag = callonce_impl::FINISH;
+ entries = auxv;
+}
+
+// When crt0 does not set up the global array, this function is called instead.
+// As its name suggests, this function is not thread-safe and should be
+// backed by a callonce guard.
+// This initialization routine mmaps a memory region to hold the entries.
+// Since the auxv tends to live throughout the program's lifetime, we do not
+// munmap it.
+[[gnu::cold]]
+LIBC_INLINE void Vector::fallback_initialize_unsync() {
+ constexpr size_t AUXV_MMAP_SIZE = FALLBACK_AUXV_ENTRIES * sizeof(Entry);
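+// 32-bit Linux targets only provide mmap2; with a zero offset it behaves the
+// same as mmap here.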
+#ifdef SYS_mmap2
+ constexpr int MMAP_SYSNO = SYS_mmap2;
+#else
+ constexpr int MMAP_SYSNO = SYS_mmap;
+#endif
+ long mmap_ret = syscall_impl<long>(MMAP_SYSNO, nullptr, AUXV_MMAP_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ // We do not proceed if mmap fails.
+ if (mmap_ret <= 0)
+ return;
+
+ // Initialize the auxv array with AT_NULL entries.
+ Entry *vector = reinterpret_cast<Entry *>(mmap_ret);
+ for (size_t i = 0; i < FALLBACK_AUXV_ENTRIES; ++i) {
+ vector[i].type = AT_NULL;
+ vector[i].val = AT_NULL;
+ }
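+  // Keep the last entry as an AT_NULL terminator so iteration always stops.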
+  size_t available_size = AUXV_MMAP_SIZE - sizeof(Entry);
+
+// Attempt 1: use prctl(PR_GET_AUXV) to get the auxv.
+// We guarantee that the vector is always padded with AT_NULL entries.
+#ifdef PR_GET_AUXV
+ long prctl_ret = syscall_impl<long>(SYS_prctl, PR_GET_AUXV,
+ reinterpret_cast<unsigned long>(vector),
+                                      available_size, 0, 0);
+ if (prctl_ret >= 0) {
+ entries = vector;
+ return;
+ }
+#endif
+
+ // Attempt 2: read /proc/self/auxv.
+#ifdef SYS_openat
+ int fd = syscall_impl<int>(SYS_openat, AT_FDCWD, "/proc/self/auxv",
+ O_RDONLY | O_CLOEXEC);
+#else
+ int fd = syscall_impl<int>(SYS_open, "/proc/self/auxv", O_RDONLY | O_CLOEXEC);
+#endif
+ if (fd < 0) {
+ syscall_impl<long>(SYS_munmap, vector, AUXV_MMAP_SIZE);
+ return;
+ }
+ uint8_t *cursor = reinterpret_cast<uint8_t *>(vector);
+ bool has_error = false;
+  while (available_size != 0) {
+    long bytes_read = syscall_impl<long>(SYS_read, fd, cursor, available_size);
+ if (bytes_read <= 0) {
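+      // Raw syscalls return -errno instead of setting errno; retry on EINTR.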
+ if (bytes_read == -EINTR)
+ continue;
+ has_error = bytes_read < 0;
+ break;
+ }
+    available_size -= bytes_read;
+ cursor += bytes_read;
+ }
+ syscall_impl<long>(SYS_close, fd);
+ if (has_error) {
+ syscall_impl<long>(SYS_munmap, vector, AUXV_MMAP_SIZE);
+ return;
+ }
+ entries = vector;
+}
+
+LIBC_INLINE cpp::optional<unsigned long> get(unsigned long type) {
+ Vector auxvec;
+ for (const auto &entry : auxvec)
+ if (entry.type == type)
+ return entry.val;
+ return cpp::nullopt;
+}
+} // namespace auxv
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AUXV_H
diff --git a/libc/src/__support/threads/callonce.h b/libc/src/__support/threads/callonce.h
index 5392722..0ccbcf9 100644
--- a/libc/src/__support/threads/callonce.h
+++ b/libc/src/__support/threads/callonce.h
@@ -23,14 +23,8 @@
#endif
namespace LIBC_NAMESPACE_DECL {
-
-// Common definitions
-using CallOnceCallback = void(void);
-namespace callonce_impl {
-int callonce_slowpath(CallOnceFlag *flag, CallOnceCallback *callback);
-} // namespace callonce_impl
-
-LIBC_INLINE int callonce(CallOnceFlag *flag, CallOnceCallback *callback) {
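+// Templating over the callback type lets callonce accept any callable and
+// keeps the slow path header-only.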
+template <class CallOnceCallback>
+LIBC_INLINE int callonce(CallOnceFlag *flag, CallOnceCallback callback) {
if (LIBC_LIKELY(callonce_impl::callonce_fastpath(flag)))
return 0;
diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index 14aaad2..39d2c6f 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -97,10 +97,8 @@ add_object_library(
# value other than 0 is dangerous. We know.
)
-add_object_library(
+add_header_library(
callonce
- SRCS
- callonce.cpp
HDRS
../callonce.h
callonce.h
diff --git a/libc/src/__support/threads/linux/callonce.cpp b/libc/src/__support/threads/linux/callonce.cpp
deleted file mode 100644
index c6e5f2a..0000000
--- a/libc/src/__support/threads/linux/callonce.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-//===-- Linux implementation of the callonce function ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/__support/threads/callonce.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/threads/linux/callonce.h"
-#include "src/__support/threads/linux/futex_utils.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace callonce_impl {
-int callonce_slowpath(CallOnceFlag *flag, CallOnceCallback *func) {
- auto *futex_word = reinterpret_cast<Futex *>(flag);
-
- FutexWordType not_called = NOT_CALLED;
-
- // The call_once call can return only after the called function |func|
- // returns. So, we use futexes to synchronize calls with the same flag value.
- if (futex_word->compare_exchange_strong(not_called, START)) {
- func();
- auto status = futex_word->exchange(FINISH);
- if (status == WAITING)
- futex_word->notify_all();
- return 0;
- }
-
- FutexWordType status = START;
- if (futex_word->compare_exchange_strong(status, WAITING) ||
- status == WAITING) {
- futex_word->wait(WAITING);
- }
-
- return 0;
-}
-} // namespace callonce_impl
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/threads/linux/callonce.h b/libc/src/__support/threads/linux/callonce.h
index b3ea3a5..70872b6 100644
--- a/libc/src/__support/threads/linux/callonce.h
+++ b/libc/src/__support/threads/linux/callonce.h
@@ -26,6 +26,31 @@ static constexpr FutexWordType FINISH = 0x33;
LIBC_INLINE bool callonce_fastpath(CallOnceFlag *flag) {
return flag->load(cpp::MemoryOrder::RELAXED) == FINISH;
}
+
+template <class CallOnceCallback>
+[[gnu::noinline, gnu::cold]] int callonce_slowpath(CallOnceFlag *flag,
+ CallOnceCallback callback) {
+
+ auto *futex_word = reinterpret_cast<Futex *>(flag);
+
+ FutexWordType not_called = NOT_CALLED;
+
+  // The call_once call can return only after the called function |callback|
+  // returns. So, we use futexes to synchronize calls with the same flag value.
+ if (futex_word->compare_exchange_strong(not_called, START)) {
+ callback();
+ auto status = futex_word->exchange(FINISH);
+ if (status == WAITING)
+ futex_word->notify_all();
+ return 0;
+ }
+
+ FutexWordType status = START;
+ if (futex_word->compare_exchange_strong(status, WAITING) || status == WAITING)
+ futex_word->wait(WAITING);
+
+ return 0;
+}
} // namespace callonce_impl
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/pthread/pthread_once.cpp b/libc/src/pthread/pthread_once.cpp
index d78644a..109f70c 100644
--- a/libc/src/pthread/pthread_once.cpp
+++ b/libc/src/pthread/pthread_once.cpp
@@ -17,8 +17,7 @@ namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(int, pthread_once,
(pthread_once_t * flag, __pthread_once_func_t func)) {
- return callonce(reinterpret_cast<CallOnceFlag *>(flag),
- reinterpret_cast<CallOnceCallback *>(func));
+ return callonce(reinterpret_cast<CallOnceFlag *>(flag), func);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/sys/auxv/linux/CMakeLists.txt b/libc/src/sys/auxv/linux/CMakeLists.txt
index 4884184c..39cfb7b 100644
--- a/libc/src/sys/auxv/linux/CMakeLists.txt
+++ b/libc/src/sys/auxv/linux/CMakeLists.txt
@@ -5,15 +5,6 @@ add_entrypoint_object(
HDRS
../getauxval.h
DEPENDS
- libc.src.sys.prctl.prctl
- libc.src.sys.mman.mmap
- libc.src.sys.mman.munmap
- libc.src.__support.threads.callonce
- libc.src.__support.common
- libc.src.errno.errno
- libc.config.app_h
- libc.src.fcntl.open
- libc.src.unistd.read
- libc.src.unistd.close
- libc.include.sys_auxv
+ libc.src.__support.OSUtil.linux.auxv
+ libc.src.__support.libc_errno
)
diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp
index b50c584..52d5a98 100644
--- a/libc/src/sys/auxv/linux/getauxval.cpp
+++ b/libc/src/sys/auxv/linux/getauxval.cpp
@@ -7,227 +7,14 @@
//===----------------------------------------------------------------------===//
#include "src/sys/auxv/getauxval.h"
-#include "config/app.h"
-#include "hdr/fcntl_macros.h"
-#include "src/__support/OSUtil/fcntl.h"
-#include "src/__support/common.h"
+#include "src/__support/OSUtil/linux/auxv.h"
#include "src/__support/libc_errno.h"
-#include "src/__support/macros/config.h"
-#include <linux/auxvec.h>
-
-// for guarded initialization
-#include "src/__support/threads/callonce.h"
-#include "src/__support/threads/linux/futex_word.h"
-
-// -----------------------------------------------------------------------------
-// TODO: This file should not include other public libc functions. Calling other
-// public libc functions is an antipattern within LLVM-libc. This needs to be
-// cleaned up. DO NOT COPY THIS.
-// -----------------------------------------------------------------------------
-
-// for mallocing the global auxv
-#include "src/sys/mman/mmap.h"
-#include "src/sys/mman/munmap.h"
-
-// for reading /proc/self/auxv
-#include "src/sys/prctl/prctl.h"
-#include "src/unistd/read.h"
-
-// getauxval will work either with or without __cxa_atexit support.
-// In order to detect if __cxa_atexit is supported, we define a weak symbol.
-// We prefer __cxa_atexit as it is always defined as a C symbol whileas atexit
-// may not be created via objcopy yet. Also, for glibc, atexit is provided via
-// libc_nonshared.a rather than libc.so. So, it is may not be made ready for
-// overlay builds.
-extern "C" [[gnu::weak]] int __cxa_atexit(void (*callback)(void *),
- void *payload, void *);
namespace LIBC_NAMESPACE_DECL {
-
-constexpr static size_t MAX_AUXV_ENTRIES = 64;
-
-// Helper to recover or set errno
-class AuxvErrnoGuard {
-public:
- AuxvErrnoGuard() : saved(libc_errno), failure(false) {}
- ~AuxvErrnoGuard() { libc_errno = failure ? ENOENT : saved; }
- void mark_failure() { failure = true; }
-
-private:
- int saved;
- bool failure;
-};
-
-// Helper to manage the memory
-static AuxEntry *auxv = nullptr;
-
-class AuxvMMapGuard {
-public:
- constexpr static size_t AUXV_MMAP_SIZE = sizeof(AuxEntry) * MAX_AUXV_ENTRIES;
-
- AuxvMMapGuard()
- : ptr(LIBC_NAMESPACE::mmap(nullptr, AUXV_MMAP_SIZE,
- PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) {}
- ~AuxvMMapGuard() {
- if (ptr != MAP_FAILED)
- LIBC_NAMESPACE::munmap(ptr, AUXV_MMAP_SIZE);
- }
- void submit_to_global() {
- // atexit may fail, we do not set it to global in that case.
- int ret = __cxa_atexit(
- [](void *) {
- LIBC_NAMESPACE::munmap(auxv, AUXV_MMAP_SIZE);
- auxv = nullptr;
- },
- nullptr, nullptr);
-
- if (ret != 0)
- return;
-
- auxv = reinterpret_cast<AuxEntry *>(ptr);
- ptr = MAP_FAILED;
- }
- bool allocated() const { return ptr != MAP_FAILED; }
- void *get() const { return ptr; }
-
-private:
- void *ptr;
-};
-
-class AuxvFdGuard {
-public:
- AuxvFdGuard() {
- auto result = internal::open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
- if (!result.has_value())
- fd = -1;
-
- fd = result.value();
- }
- ~AuxvFdGuard() {
- if (fd != -1)
- internal::close(fd);
- }
- bool valid() const { return fd != -1; }
- int get() const { return fd; }
-
-private:
- int fd;
-};
-
-static void initialize_auxv_once(void) {
- // If we cannot get atexit, we cannot register the cleanup function.
- if (&__cxa_atexit == nullptr)
- return;
-
- AuxvMMapGuard mmap_guard;
- if (!mmap_guard.allocated())
- return;
- auto *ptr = reinterpret_cast<AuxEntry *>(mmap_guard.get());
-
- // We get one less than the max size to make sure the search always
- // terminates. MMAP private pages are zeroed out already.
- size_t available_size = AuxvMMapGuard::AUXV_MMAP_SIZE - sizeof(AuxEntryType);
- // PR_GET_AUXV is only available on Linux kernel 6.1 and above. If this is not
- // defined, we direcly fall back to reading /proc/self/auxv. In case the libc
- // is compiled and run on separate kernels, we also check the return value of
- // prctl.
-#ifdef PR_GET_AUXV
- int ret = prctl(PR_GET_AUXV, reinterpret_cast<unsigned long>(ptr),
- available_size, 0, 0);
- if (ret >= 0) {
- mmap_guard.submit_to_global();
- return;
- }
-#endif
- AuxvFdGuard fd_guard;
- if (!fd_guard.valid())
- return;
- auto *buf = reinterpret_cast<char *>(ptr);
- libc_errno = 0;
- bool error_detected = false;
- // Read until we use up all the available space or we finish reading the file.
- while (available_size != 0) {
- ssize_t bytes_read =
- LIBC_NAMESPACE::read(fd_guard.get(), buf, available_size);
- if (bytes_read <= 0) {
- if (libc_errno == EINTR)
- continue;
- // Now, we either have an non-recoverable error or we have reached the end
- // of the file. Mark `error_detected` accordingly.
- if (bytes_read == -1)
- error_detected = true;
- break;
- }
- buf += bytes_read;
- available_size -= bytes_read;
- }
- // If we get out of the loop without an error, the auxv is ready.
- if (!error_detected)
- mmap_guard.submit_to_global();
-}
-
-static AuxEntry read_entry(int fd) {
- AuxEntry buf;
- size_t size = sizeof(AuxEntry);
- char *ptr = reinterpret_cast<char *>(&buf);
- while (size > 0) {
- ssize_t ret = LIBC_NAMESPACE::read(fd, ptr, size);
- if (ret < 0) {
- if (libc_errno == EINTR)
- continue;
- // Error detected, return AT_NULL
- buf.id = AT_NULL;
- buf.value = AT_NULL;
- break;
- }
- ptr += ret;
- size -= ret;
- }
- return buf;
-}
-
LLVM_LIBC_FUNCTION(unsigned long, getauxval, (unsigned long id)) {
- // Fast path when libc is loaded by its own initialization code. In this case,
- // app.auxv_ptr is already set to the auxv passed on the initial stack of the
- // process.
- AuxvErrnoGuard errno_guard;
-
- auto search_auxv = [&errno_guard](AuxEntry *auxv,
- unsigned long id) -> AuxEntryType {
- for (auto *ptr = auxv; ptr->id != AT_NULL; ptr++)
- if (ptr->id == id)
- return ptr->value;
-
- errno_guard.mark_failure();
- return AT_NULL;
- };
-
- // App is a weak symbol that is only defined if libc is linked to its own
- // initialization routine. We need to check if it is null.
- if (&app != nullptr)
- return search_auxv(app.auxv_ptr, id);
-
- static FutexWordType once_flag;
- LIBC_NAMESPACE::callonce(reinterpret_cast<CallOnceFlag *>(&once_flag),
- initialize_auxv_once);
- if (auxv != nullptr)
- return search_auxv(auxv, id);
-
- // Fallback to use read without mmap
- AuxvFdGuard fd_guard;
- if (fd_guard.valid()) {
- while (true) {
- AuxEntry buf = read_entry(fd_guard.get());
- if (buf.id == AT_NULL)
- break;
- if (buf.id == id)
- return buf.value;
- }
- }
-
- // cannot find the entry after all methods, mark failure and return 0
- errno_guard.mark_failure();
- return AT_NULL;
+ if (cpp::optional<unsigned long> val = auxv::get(id))
+ return *val;
+ libc_errno = ENOENT;
+ return 0;
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/threads/call_once.cpp b/libc/src/threads/call_once.cpp
index 8466cd62..b60a20d 100644
--- a/libc/src/threads/call_once.cpp
+++ b/libc/src/threads/call_once.cpp
@@ -17,8 +17,7 @@ namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(void, call_once,
(once_flag * flag, __call_once_func_t func)) {
- callonce(reinterpret_cast<CallOnceFlag *>(flag),
- reinterpret_cast<CallOnceCallback *>(func));
+ callonce(reinterpret_cast<CallOnceFlag *>(flag), func);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 8bd0c3a..bf6c96e 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -14,6 +14,7 @@
#include "src/stdlib/exit.h"
extern "C" int main(int argc, char **argv, char **envp);
+extern "C" void __cxa_finalize(void *dso);
namespace LIBC_NAMESPACE_DECL {
@@ -68,9 +69,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel,
clang::amdgpu_flat_work_group_size(1, 1),
clang::amdgpu_max_num_work_groups(1)]] void
-_end(int retval) {
- // Only a single thread should call `exit` here, the rest should gracefully
- // return from the kernel. This is so only one thread calls the destructors
- // registred with 'atexit' above.
- LIBC_NAMESPACE::exit(retval);
+_end() {
+  // Only a single thread should call the destructors registered with 'atexit'.
+ // The loader utility will handle the actual exit and return code cleanly.
+ __cxa_finalize(nullptr);
}
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index bc529b3..3dd85fd 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -14,6 +14,7 @@
#include "src/stdlib/exit.h"
extern "C" int main(int argc, char **argv, char **envp);
+extern "C" void __cxa_finalize(void *dso);
namespace LIBC_NAMESPACE_DECL {
@@ -70,9 +71,8 @@ _start(int argc, char **argv, char **envp, int *ret) {
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
}
-extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_end(int retval) {
- // To finis the execution we invoke all the callbacks registered via 'atexit'
- // and then exit with the appropriate return value.
- LIBC_NAMESPACE::exit(retval);
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void _end() {
+ // Only a single thread should call the destructors registered with 'atexit'.
+ // The loader utility will handle the actual exit and return code cleanly.
+ __cxa_finalize(nullptr);
}
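
[editor note] Both GPU startup files now defer process exit to the loader and only flush the atexit handlers. A host-side sketch of the assumed __cxa_finalize(nullptr) semantics (Itanium C++ ABI: run and unregister every handler registered via atexit/__cxa_atexit, without terminating):

#include <cstdio>
#include <cstdlib>

extern "C" void __cxa_finalize(void *dso); // provided by the C++ runtime

int main() {
  std::atexit([] { std::puts("atexit callback"); });
  __cxa_finalize(nullptr); // runs the callback now; does not terminate
  std::puts("still running; exit policy belongs to the caller/loader");
  return 0;
}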
diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt
index 7af1819..df2c4b9 100644
--- a/libc/startup/linux/CMakeLists.txt
+++ b/libc/startup/linux/CMakeLists.txt
@@ -105,6 +105,7 @@ add_object_library(
libc.src.stdlib.exit
libc.src.stdlib.atexit
libc.src.unistd.environ
+ libc.src.__support.OSUtil.linux.auxv
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin # avoid emitting unexpected calls
diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp
index 94c4ec7..a67bf18 100644
--- a/libc/startup/linux/do_start.cpp
+++ b/libc/startup/linux/do_start.cpp
@@ -9,6 +9,7 @@
#include "config/linux/app.h"
#include "hdr/stdint_proxy.h"
#include "include/llvm-libc-macros/link-macros.h"
+#include "src/__support/OSUtil/linux/auxv.h"
#include "src/__support/OSUtil/syscall.h"
#include "src/__support/macros/config.h"
#include "src/__support/threads/thread.h"
@@ -88,17 +89,19 @@ void teardown_main_tls() { cleanup_tls(tls.addr, tls.size); }
// denoted by an AT_NULL entry.
ElfW(Phdr) *program_hdr_table = nullptr;
uintptr_t program_hdr_count = 0;
- app.auxv_ptr = reinterpret_cast<AuxEntry *>(env_end_marker + 1);
- for (auto *aux_entry = app.auxv_ptr; aux_entry->id != AT_NULL; ++aux_entry) {
- switch (aux_entry->id) {
+ auxv::Vector::initialize_unsafe(
+ reinterpret_cast<const auxv::Entry *>(env_end_marker + 1));
+ auxv::Vector auxvec;
+ for (const auto &aux_entry : auxvec) {
+ switch (aux_entry.type) {
case AT_PHDR:
- program_hdr_table = reinterpret_cast<ElfW(Phdr) *>(aux_entry->value);
+ program_hdr_table = reinterpret_cast<ElfW(Phdr) *>(aux_entry.val);
break;
case AT_PHNUM:
- program_hdr_count = aux_entry->value;
+ program_hdr_count = aux_entry.val;
break;
case AT_PAGESZ:
- app.page_size = aux_entry->value;
+ app.page_size = aux_entry.val;
break;
default:
break; // TODO: Read other useful entries from the aux vector.
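
[editor note] The new auxv::Vector walk mirrors the raw loop it replaces: entries until an AT_NULL terminator, each a (type, val) pair. A freestanding sketch of that invariant, with a stand-in Entry type (the real auxv::Entry lives in src/__support/OSUtil/linux/auxv.h):

#include <elf.h> // AT_NULL, AT_PAGESZ, ... (glibc header, used here only for the constants)

struct Entry { // stand-in for auxv::Entry
  unsigned long type;
  unsigned long val;
};

// Scan a raw, AT_NULL-terminated vector -- exactly the layout do_start.cpp
// hands to auxv::Vector::initialize_unsafe() via env_end_marker + 1.
static unsigned long find(const Entry *raw, unsigned long wanted) {
  for (const Entry *e = raw; e->type != AT_NULL; ++e)
    if (e->type == wanted)
      return e->val;
  return 0;
}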
diff --git a/lldb/source/Core/IOHandler.cpp b/lldb/source/Core/IOHandler.cpp
index 57819ee..c2530aa 100644
--- a/lldb/source/Core/IOHandler.cpp
+++ b/lldb/source/Core/IOHandler.cpp
@@ -152,15 +152,16 @@ void IOHandlerConfirm::IOHandlerComplete(IOHandler &io_handler,
void IOHandlerConfirm::IOHandlerInputComplete(IOHandler &io_handler,
std::string &line) {
- if (line.empty()) {
+ const llvm::StringRef input = llvm::StringRef(line).rtrim();
+ if (input.empty()) {
// User just hit enter, set the response to the default
m_user_response = m_default_response;
io_handler.SetIsDone(true);
return;
}
- if (line.size() == 1) {
- switch (line[0]) {
+ if (input.size() == 1) {
+ switch (input[0]) {
case 'y':
case 'Y':
m_user_response = true;
@@ -176,10 +177,10 @@ void IOHandlerConfirm::IOHandlerInputComplete(IOHandler &io_handler,
}
}
- if (line == "yes" || line == "YES" || line == "Yes") {
+ if (input.equals_insensitive("yes")) {
m_user_response = true;
io_handler.SetIsDone(true);
- } else if (line == "no" || line == "NO" || line == "No") {
+ } else if (input.equals_insensitive("no")) {
m_user_response = false;
io_handler.SetIsDone(true);
}
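
[editor note] The confirm handler now normalizes input once and matches case-insensitively, so "yEs " or "NO" resolve as expected. A compact sketch of the same matching using nothing but llvm::StringRef:

#include "llvm/ADT/StringRef.h"

// Returns true and sets `answer` if `line` is a recognizable yes/no reply.
static bool parseConfirmation(llvm::StringRef line, bool &answer) {
  const llvm::StringRef input = line.rtrim(); // drop trailing whitespace
  if (input.equals_insensitive("y") || input.equals_insensitive("yes")) {
    answer = true;
    return true;
  }
  if (input.equals_insensitive("n") || input.equals_insensitive("no")) {
    answer = false;
    return true;
  }
  return false; // unrecognized; the caller re-prompts
}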
diff --git a/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py b/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
index 2412b29..305e3cc 100644
--- a/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
+++ b/lldb/test/API/driver/quit_speed/TestQuitWithProcess.py
@@ -33,3 +33,28 @@ class DriverQuitSpeedTest(PExpectTest):
child.sendline("quit")
print("sent quit")
child.expect(pexpect.EOF, timeout=15)
+
+ @skipIfAsan
+ def test_run_quit_with_prompt(self):
+ """Test that the lldb driver's batch mode works correctly with trailing space in confimation."""
+ import pexpect
+
+ self.build()
+
+ exe = self.getBuildArtifact("a.out")
+
+ self.launch(executable=exe)
+ child = self.child
+
+ # Launch the process without a TTY so we don't have to interrupt:
+ child.sendline("process launch -n")
+ print("launched process")
+ child.expect(r"Process ([\d]*) launched:")
+ print("Got launch message")
+ child.sendline("quit")
+ print("sent quit")
+
+ child.expect(r".*LLDB will kill one or more processes.*")
+ # Add a trailing space to the confirmation.
+ child.sendline("yEs ")
+ child.expect(pexpect.EOF, timeout=15)
diff --git a/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py b/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py
index 6535f89..768dd6f 100644
--- a/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py
+++ b/lldb/test/API/functionalities/unwind/cortex-m-exception/TestCortexMExceptionUnwind.py
@@ -12,6 +12,21 @@ from lldbsuite.test import lldbutil
class TestCortexMExceptionUnwind(TestBase):
NO_DEBUG_INFO_TESTCASE = True
+ # on the lldb-remote-linux-ubuntu CI, the binary.json's triple of
+ # armv7m-apple is not being set in the Target triple, and we're
+ # picking the wrong ABI plugin, ABISysV_arm.
+ # ABISysV_arm::CreateDefaultUnwindPlan() doesn't have a way to detect
+ # arm/thumb for a stack frame, or even the Target's triple for a
+ # Cortex-M part that is always thumb. It hardcodes r11 as the frame
+ # pointer register, which is correct for arm code but not thumb.
+ # It is never correct on a Cortex-M target.
+ # The Darwin ABIMacOSX_arm diverges from AAPCS and always uses r7 for
+ # the frame pointer -- the thumb convention -- whether executing arm or
+ # thumb. So its CreateDefaultUnwindPlan picks the correct register for
+ # the frame pointer, and we can walk the stack.
+ # ABISysV_arm::CreateDefaultUnwindPlan will only get one frame and
+ # not be able to continue.
+ @skipIfRemote
def test_no_fpu(self):
"""Test that we can backtrace correctly through an ARM Cortex-M Exception return stack"""
@@ -44,9 +59,10 @@ class TestCortexMExceptionUnwind(TestBase):
# frames above that. The topmost two stack frames
# were not interesting for this test, so I didn't
# create symbols for them.
- self.assertEqual(thread.GetNumFrames(), 3)
+ self.assertEqual(thread.GetNumFrames(), 6)
stackframe_names = [
"exception_catcher",
+ "exception_catcher",
"exception_thrower",
"main",
]
diff --git a/lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml b/lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml
index 0b4e1f8..9ce5ff4 100644
--- a/lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml
+++ b/lldb/test/API/functionalities/unwind/cortex-m-exception/armv7m-nofpu-exception.yaml
@@ -2,8 +2,8 @@ cpu: armv7m
threads:
- regsets:
- flavor: gpr
- registers: [{name: sp, value: 0x2000fe88}, {name: r7, value: 0x2000fe88},
- {name: pc, value: 0x00203916}, {name: lr, value: 0x0020392d}]
+ registers: [{name: sp, value: 0x2000fe70}, {name: r7, value: 0x2000fe80},
+ {name: pc, value: 0x0020392c}, {name: lr, value: 0x0020392d}]
memory-regions:
# stack memory fetched via
# (lldb) p/x $sp
@@ -14,7 +14,7 @@ memory-regions:
0x0000002a, 0x20010e58, 0x00203923, 0x00000001,
0x2000fe88, 0x00203911, 0x2000ffdc, 0xfffffff9,
0x00000102, 0x00000002, 0x000003f0, 0x0000002a,
- 0x20012620, 0x00203215, 0x00202a92, 0x81000200,
+ 0x20012620, 0x00203215, 0x00203366, 0x81000200,
0x00203215, 0x200128b0, 0x0024928d, 0x2000fecc,
0x002491ed, 0x20010e58, 0x20010e4c, 0x2000ffa0,
0x200107a0, 0x0000003c, 0x200116e8, 0x200108b0,
@@ -62,26 +62,3 @@ memory-regions:
0x98, 0xae, 0x28, 0x00
]
- # exception_thrower
- # (lldb) disass -b -c 12 -n exception_thrower
- # 0x202a88 <+0>: 0xb5f0 push {r4, r5, r6, r7, lr}
- # 0x202a8a <+2>: 0xaf03 add r7, sp, #0xc
- # 0x202a8c <+4>: 0xe92d0f00 push.w {r8, r9, r10, r11}
- # 0x202a90 <+8>: 0xb0c3 sub sp, #0x10c
- # 0x202a92 <+10>: 0xf7ffffd9 bl 0x202a48
- - addr: 0x202a88
- UInt8: [
- 0xf0, 0xb5, 0x03, 0xaf, 0x2d, 0xe9, 0x00, 0x0f,
- 0xc3, 0xb0, 0xff, 0xf7, 0xd9, 0xff, 0xff, 0xf7
- ]
-
- # main:
- # 0x202a7e <+0>: push {r7, lr}
- # 0x202a80 <+2>: mov r7, sp
- # 0x202a82 <+4>: bl 0x202a88 ; exception_thrower
- # 0x202a86 <+8>: nop
- - addr: 0x202a7e
- UInt8: [
- 0x80, 0xb5, 0x6f, 0x46, 0x00, 0xf0, 0x01, 0xf8,
- 0x00, 0xbf
- ]
diff --git a/lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json b/lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json
index 0de0169..8fcd530 100644
--- a/lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json
+++ b/lldb/test/API/functionalities/unwind/cortex-m-exception/binary.json
@@ -1,5 +1,5 @@
{
- "triple": "armv7m--",
+ "triple": "armv7m-apple",
"uuid": "2D157DBA-53C9-3AC7-B5A1-9D336EC831CB",
"type": "executable",
"sections": [
@@ -28,13 +28,13 @@
{
"name": "exception_catcher",
"type": "code",
- "size": 32,
+ "size": 44,
"address": 2111760
},
{
"name": "exception_thrower",
"type": "code",
- "size": 16,
+ "size": 2652,
"address": 2108040
}
]
diff --git a/llvm/docs/CFIVerify.rst b/llvm/docs/CFIVerify.rst
index 6403347..f766be1 100644
--- a/llvm/docs/CFIVerify.rst
+++ b/llvm/docs/CFIVerify.rst
@@ -10,7 +10,7 @@ Objective
This document provides an overview of an external tool to verify the protection
mechanisms implemented by Clang's *Control Flow Integrity* (CFI) schemes
-(``-fsanitize=cfi``). This tool, provided a binary or DSO, should infer whether
+(``-fsanitize=cfi``). This tool, given a binary or DSO, should infer whether
indirect control flow operations are protected by CFI, and should output these
results in a human-readable form.
@@ -22,12 +22,12 @@ Location
========
This tool will be present as a part of the LLVM toolchain, and will reside in
-the "/llvm/tools/llvm-cfi-verify" directory, relative to the LLVM trunk. It will
+the ``/llvm/tools/llvm-cfi-verify`` directory, relative to the LLVM trunk. It will
be tested in two ways:
- Unit tests to validate code sections, present in
- "/llvm/unittests/tools/llvm-cfi-verify".
-- Integration tests, present in "/llvm/tools/clang/test/LLVMCFIVerify". These
+ ``/llvm/unittests/tools/llvm-cfi-verify``.
+- Integration tests, present in ``/llvm/tools/clang/test/LLVMCFIVerify``. These
integration tests run as part of clang's continuous integration
framework, ensuring that updates to the compiler which reduce CFI coverage on
indirect control flow instructions are identified.
@@ -38,16 +38,16 @@ Background
This tool will continuously validate that CFI directives are properly
implemented around all indirect control flows by analysing the output machine
code. The analysis of machine code is important as it ensures that any bugs
-present in linker or compiler do not subvert CFI protections in the final
+present in the linker or compiler do not subvert CFI protections in the final
shipped binary.
Unprotected indirect control flow instructions will be flagged for manual
-review. These unexpected control flows may simply have not been accounted for in
-the compiler implementation of CFI (e.g. indirect jumps to facilitate switch
+review. These unexpected control flows may not have been accounted for in
+the compiler implementation of CFI (e.g., indirect jumps to facilitate switch
statements may not be fully protected).
It may be possible in the future to extend this tool to flag unnecessary CFI
-directives (e.g. CFI directives around a static call to a non-polymorphic base
+directives (e.g., CFI directives around a static call to a non-polymorphic base
type). This type of directive has no security implications, but may incur
a performance cost.
@@ -66,7 +66,7 @@ the disassembly. A control flow graph would be generated from a small buffer of
the instructions surrounding the 'target' control flow instruction. If the
target instruction is branched-to, the fallthrough of the branch should be the
CFI trap (on x86, this is a ``ud2`` instruction). If the target instruction is
-the fallthrough (i.e. immediately succeeds) of a conditional jump, the
+the fallthrough (i.e., immediately succeeds) of a conditional jump, the
conditional jump target should be the CFI trap. If an indirect control flow
instruction does not conform to one of these formats, the target will be noted
as being CFI-unprotected.
@@ -76,7 +76,7 @@ fallthrough of a conditional jump), if the target represents a vcall that takes
arguments, these arguments may be pushed to the stack after the branch but
before the target instruction. In these cases, a secondary 'spill graph' is
constructed, to ensure the register argument used by the indirect jump/call is
-not spilled from the stack at any point in the interim period. If there are no
+not spilled from the stack at any point in the interim. If there are no
spills that affect the target register, the target is marked as CFI-protected.
Other Design Notes
diff --git a/llvm/include/llvm/ADT/Bitset.h b/llvm/include/llvm/ADT/Bitset.h
index ecb6b14..b1e539e 100644
--- a/llvm/include/llvm/ADT/Bitset.h
+++ b/llvm/include/llvm/ADT/Bitset.h
@@ -28,15 +28,15 @@ namespace llvm {
/// initialization.
template <unsigned NumBits>
class Bitset {
- typedef uintptr_t BitWord;
+ using BitWord = uintptr_t;
- enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT };
+ static constexpr unsigned BitwordBits = sizeof(BitWord) * CHAR_BIT;
- static_assert(BITWORD_SIZE == 64 || BITWORD_SIZE == 32,
+ static_assert(BitwordBits == 64 || BitwordBits == 32,
"Unsupported word size");
static constexpr unsigned NumWords =
- (NumBits + BITWORD_SIZE - 1) / BITWORD_SIZE;
+ (NumBits + BitwordBits - 1) / BitwordBits;
protected:
using StorageType = std::array<BitWord, NumWords>;
@@ -60,23 +60,23 @@ public:
}
constexpr Bitset &set(unsigned I) {
- Bits[I / BITWORD_SIZE] |= BitWord(1) << (I % BITWORD_SIZE);
+ Bits[I / BitwordBits] |= BitWord(1) << (I % BitwordBits);
return *this;
}
constexpr Bitset &reset(unsigned I) {
- Bits[I / BITWORD_SIZE] &= ~(BitWord(1) << (I % BITWORD_SIZE));
+ Bits[I / BitwordBits] &= ~(BitWord(1) << (I % BitwordBits));
return *this;
}
constexpr Bitset &flip(unsigned I) {
- Bits[I / BITWORD_SIZE] ^= BitWord(1) << (I % BITWORD_SIZE);
+ Bits[I / BitwordBits] ^= BitWord(1) << (I % BitwordBits);
return *this;
}
constexpr bool operator[](unsigned I) const {
- BitWord Mask = BitWord(1) << (I % BITWORD_SIZE);
- return (Bits[I / BITWORD_SIZE] & Mask) != 0;
+ BitWord Mask = BitWord(1) << (I % BitwordBits);
+ return (Bits[I / BitwordBits] & Mask) != 0;
}
constexpr bool test(unsigned I) const { return (*this)[I]; }
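
[editor note] The renamed BitwordBits constant feeds the usual word/bit split. A worked, standalone example on a 64-bit word: bit 70 lives in word 70 / 64 == 1 under mask 1 << (70 % 64) == 1 << 6:

#include <array>
#include <cstdint>

int main() {
  constexpr unsigned BitwordBits = 64;
  std::array<std::uint64_t, 2> Bits{}; // two words cover NumBits <= 128
  const unsigned I = 70;
  Bits[I / BitwordBits] |= std::uint64_t(1) << (I % BitwordBits);    // set
  const bool On = (Bits[I / BitwordBits] >> (I % BitwordBits)) & 1;  // test
  Bits[I / BitwordBits] &= ~(std::uint64_t(1) << (I % BitwordBits)); // reset
  return (On && Bits[1] == 0) ? 0 : 1;
}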
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 858c1d5..8876e4e 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1002,10 +1002,14 @@ public:
/// (at every loop iteration). It is, at the same time, the minimum number
/// of times S is divisible by 2. For example, given {4,+,8} it returns 2.
/// If S is guaranteed to be 0, it returns the bitwidth of S.
- LLVM_ABI uint32_t getMinTrailingZeros(const SCEV *S);
+ /// If \p CtxI is not nullptr, return the minimum trailing zeros valid at \p CtxI.
+ LLVM_ABI uint32_t getMinTrailingZeros(const SCEV *S,
+ const Instruction *CtxI = nullptr);
- /// Returns the max constant multiple of S.
- LLVM_ABI APInt getConstantMultiple(const SCEV *S);
+ /// Returns the max constant multiple of S. If \p CtxI is not nullptr, return
+ /// a constant multiple valid at \p CtxI.
+ LLVM_ABI APInt getConstantMultiple(const SCEV *S,
+ const Instruction *CtxI = nullptr);
// Returns the max constant multiple of S. If S is exactly 0, return 1.
LLVM_ABI APInt getNonZeroConstantMultiple(const SCEV *S);
@@ -1525,8 +1529,10 @@ private:
/// Return the Value set from which the SCEV expr is generated.
ArrayRef<Value *> getSCEVValues(const SCEV *S);
- /// Private helper method for the getConstantMultiple method.
- APInt getConstantMultipleImpl(const SCEV *S);
+ /// Private helper method for the getConstantMultiple method. If \p CtxI is
+ /// not nullptr, return a constant multiple valid at \p CtxI.
+ APInt getConstantMultipleImpl(const SCEV *S,
+ const Instruction *CtxI = nullptr);
/// Information about the number of times a particular loop exit may be
/// reached before exiting the loop.
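
[editor note] The documented {4,+,8} example is easy to check by hand: the recurrence takes values 4, 12, 20, 28, ..., whose gcd is 4, giving two guaranteed trailing zero bits. A tiny standalone check:

#include <cstdio>
#include <numeric>

int main() {
  const unsigned vals[] = {4, 12, 20, 28}; // {4,+,8} at iterations 0..3
  unsigned g = 0;
  for (unsigned v : vals)
    g = std::gcd(g, v);
  // Prints "multiple 4, trailing zeros 2", matching getMinTrailingZeros.
  std::printf("multiple %u, trailing zeros %d\n", g, __builtin_ctz(g));
  return 0;
}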
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h
index 2c50125..815e85d 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.h
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.h
@@ -500,8 +500,15 @@ toDW_LNAME(SourceLanguage language) {
return {};
}
+/// Returns a version-independent language name.
LLVM_ABI llvm::StringRef LanguageDescription(SourceLanguageName name);
+/// Returns a language name corresponding to the specified version.
+/// If the version is not recognized for the specified language, returns
+/// the version-independent name.
+LLVM_ABI llvm::StringRef LanguageDescription(SourceLanguageName Name,
+ uint32_t Version);
+
inline bool isCPlusPlus(SourceLanguage S) {
bool result = false;
// Deliberately enumerate all the language options so we get a warning when
@@ -997,6 +1004,7 @@ LLVM_ABI StringRef VisibilityString(unsigned Visibility);
LLVM_ABI StringRef VirtualityString(unsigned Virtuality);
LLVM_ABI StringRef EnumKindString(unsigned EnumKind);
LLVM_ABI StringRef LanguageString(unsigned Language);
+LLVM_ABI StringRef SourceLanguageNameString(SourceLanguageName Lang);
LLVM_ABI StringRef CaseString(unsigned Case);
LLVM_ABI StringRef ConventionString(unsigned Convention);
LLVM_ABI StringRef InlineCodeString(unsigned Code);
@@ -1038,6 +1046,7 @@ LLVM_ABI unsigned getSubOperationEncoding(unsigned OpEncoding,
LLVM_ABI unsigned getVirtuality(StringRef VirtualityString);
LLVM_ABI unsigned getEnumKind(StringRef EnumKindString);
LLVM_ABI unsigned getLanguage(StringRef LanguageString);
+LLVM_ABI unsigned getSourceLanguageName(StringRef SourceLanguageNameString);
LLVM_ABI unsigned getCallingConvention(StringRef LanguageString);
LLVM_ABI unsigned getAttributeEncoding(StringRef EncodingString);
LLVM_ABI unsigned getMacinfo(StringRef MacinfoString);
diff --git a/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h b/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h
index bfcbf72..7ef6667 100644
--- a/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h
+++ b/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h
@@ -27,160 +27,15 @@ class Metadata;
namespace hlsl {
namespace rootsig {
-
-template <typename T>
class RootSignatureValidationError
- : public ErrorInfo<RootSignatureValidationError<T>> {
-public:
- static char ID;
- StringRef ParamName;
- T Value;
-
- RootSignatureValidationError(StringRef ParamName, T Value)
- : ParamName(ParamName), Value(Value) {}
-
- void log(raw_ostream &OS) const override {
- OS << "Invalid value for " << ParamName << ": " << Value;
- }
-
- std::error_code convertToErrorCode() const override {
- return llvm::inconvertibleErrorCode();
- }
-};
-
-class OffsetAppendAfterOverflow : public ErrorInfo<OffsetAppendAfterOverflow> {
-public:
- static char ID;
- dxil::ResourceClass Type;
- uint32_t Register;
- uint32_t Space;
-
- OffsetAppendAfterOverflow(dxil::ResourceClass Type, uint32_t Register,
- uint32_t Space)
- : Type(Type), Register(Register), Space(Space) {}
-
- void log(raw_ostream &OS) const override {
- OS << "Range " << getResourceClassName(Type) << "(register=" << Register
- << ", space=" << Space << ") "
- << "cannot be appended after an unbounded range ";
- }
-
- std::error_code convertToErrorCode() const override {
- return llvm::inconvertibleErrorCode();
- }
-};
-
-class ShaderRegisterOverflowError
- : public ErrorInfo<ShaderRegisterOverflowError> {
-public:
- static char ID;
- dxil::ResourceClass Type;
- uint32_t Register;
- uint32_t Space;
-
- ShaderRegisterOverflowError(dxil::ResourceClass Type, uint32_t Register,
- uint32_t Space)
- : Type(Type), Register(Register), Space(Space) {}
-
- void log(raw_ostream &OS) const override {
- OS << "Overflow for shader register range: " << getResourceClassName(Type)
- << "(register=" << Register << ", space=" << Space << ").";
- }
-
- std::error_code convertToErrorCode() const override {
- return llvm::inconvertibleErrorCode();
- }
-};
-
-class OffsetOverflowError : public ErrorInfo<OffsetOverflowError> {
-public:
- static char ID;
- dxil::ResourceClass Type;
- uint32_t Register;
- uint32_t Space;
-
- OffsetOverflowError(dxil::ResourceClass Type, uint32_t Register,
- uint32_t Space)
- : Type(Type), Register(Register), Space(Space) {}
-
- void log(raw_ostream &OS) const override {
- OS << "Offset overflow for descriptor range: " << getResourceClassName(Type)
- << "(register=" << Register << ", space=" << Space << ").";
- }
-
- std::error_code convertToErrorCode() const override {
- return llvm::inconvertibleErrorCode();
- }
-};
-
-class TableSamplerMixinError : public ErrorInfo<TableSamplerMixinError> {
+ : public ErrorInfo<RootSignatureValidationError> {
public:
static char ID;
- dxil::ResourceClass Type;
- uint32_t Location;
-
- TableSamplerMixinError(dxil::ResourceClass Type, uint32_t Location)
- : Type(Type), Location(Location) {}
-
- void log(raw_ostream &OS) const override {
- OS << "Samplers cannot be mixed with other "
- << "resource types in a descriptor table, " << getResourceClassName(Type)
- << "(location=" << Location << ")";
- }
-
- std::error_code convertToErrorCode() const override {
- return llvm::inconvertibleErrorCode();
- }
-};
-
-class GenericRSMetadataError : public ErrorInfo<GenericRSMetadataError> {
-public:
- LLVM_ABI static char ID;
- StringRef Message;
- MDNode *MD;
-
- GenericRSMetadataError(StringRef Message, MDNode *MD)
- : Message(Message), MD(MD) {}
-
- void log(raw_ostream &OS) const override {
- OS << Message;
- if (MD) {
- OS << "\n";
- MD->printTree(OS);
- }
- }
-
- std::error_code convertToErrorCode() const override {
- return llvm::inconvertibleErrorCode();
- }
-};
-
-class InvalidRSMetadataFormat : public ErrorInfo<InvalidRSMetadataFormat> {
-public:
- LLVM_ABI static char ID;
- StringRef ElementName;
+ std::string Msg;
- InvalidRSMetadataFormat(StringRef ElementName) : ElementName(ElementName) {}
-
- void log(raw_ostream &OS) const override {
- OS << "Invalid format for " << ElementName;
- }
+ RootSignatureValidationError(const Twine &Msg) : Msg(Msg.str()) {}
- std::error_code convertToErrorCode() const override {
- return llvm::inconvertibleErrorCode();
- }
-};
-
-class InvalidRSMetadataValue : public ErrorInfo<InvalidRSMetadataValue> {
-public:
- LLVM_ABI static char ID;
- StringRef ParamName;
-
- InvalidRSMetadataValue(StringRef ParamName) : ParamName(ParamName) {}
-
- void log(raw_ostream &OS) const override {
- OS << "Invalid value for " << ParamName;
- }
+ void log(raw_ostream &OS) const override { OS << Msg; }
std::error_code convertToErrorCode() const override {
return llvm::inconvertibleErrorCode();
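
[editor note] With the template and the six bespoke classes gone, every producer builds the one error type from a preformatted message. A usage sketch, assuming the header above and the stock llvm Error machinery:

#include "llvm/Frontend/HLSL/RootSignatureMetadata.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"

int main() {
  using llvm::hlsl::rootsig::RootSignatureValidationError;
  llvm::Error E = llvm::make_error<RootSignatureValidationError>(
      llvm::formatv("Invalid value for {0}: {1}", "RegisterSpace", 7).str());
  llvm::logAllUnhandledErrors(std::move(E), llvm::errs(), "rootsig: ");
  return 1; // an Error was raised and reported
}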
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index 25cbc38..6529412 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -146,9 +146,9 @@ namespace llvm {
/// \param SDK The SDK name. On Darwin, this is the last component
/// of the sysroot.
LLVM_ABI DICompileUnit *
- createCompileUnit(unsigned Lang, DIFile *File, StringRef Producer,
- bool isOptimized, StringRef Flags, unsigned RV,
- StringRef SplitName = StringRef(),
+ createCompileUnit(DISourceLanguageName Lang, DIFile *File,
+ StringRef Producer, bool isOptimized, StringRef Flags,
+ unsigned RV, StringRef SplitName = StringRef(),
DICompileUnit::DebugEmissionKind Kind =
DICompileUnit::DebugEmissionKind::FullDebug,
uint64_t DWOId = 0, bool SplitDebugInlining = true,
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 7c6e709..c626efc 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -66,6 +66,55 @@ namespace dwarf {
enum Tag : uint16_t;
}
+/// Wrapper structure that holds a language name and its version.
+///
+/// Some debug-info formats, particularly DWARF, distinguish between
+/// language codes that include the version name and codes that don't.
+/// DISourceLanguageName may hold either of these.
+///
+class DISourceLanguageName {
+ /// Language version. The version scheme is language
+ /// dependent.
+ uint32_t Version = 0;
+
+ /// Language name.
+ /// If \ref HasVersion is \c true, then this name
+ /// is version independent (i.e., doesn't include the language
+ /// version in its name).
+ uint16_t Name;
+
+ /// If \c true, then \ref Version is interpretable and \ref Name
+ /// is a version independent name.
+ bool HasVersion;
+
+public:
+ bool hasVersionedName() const { return HasVersion; }
+
+ /// Returns a versioned or unversioned language name.
+ uint16_t getName() const { return Name; }
+
+ /// Transitional API for cases where we do not yet support
+ /// versioned source language names. Use \ref getName instead.
+ ///
+ /// FIXME: remove once all callers of this API account for versioned
+ /// names.
+ uint16_t getUnversionedName() const {
+ assert(!hasVersionedName());
+ return Name;
+ }
+
+ /// Returns language version. Only valid for versioned language names.
+ uint32_t getVersion() const {
+ assert(hasVersionedName());
+ return Version;
+ }
+
+ DISourceLanguageName(uint16_t Lang, uint32_t Version)
+ : Version(Version), Name(Lang), HasVersion(true) {}
+ DISourceLanguageName(uint16_t Lang)
+ : Version(0), Name(Lang), HasVersion(false) {}
+};
+
class DbgVariableRecord;
LLVM_ABI extern cl::opt<bool> EnableFSDiscriminator;
@@ -2003,7 +2052,7 @@ public:
LLVM_ABI static const char *nameTableKindString(DebugNameTableKind PK);
private:
- unsigned SourceLanguage;
+ DISourceLanguageName SourceLanguage;
unsigned RuntimeVersion;
uint64_t DWOId;
unsigned EmissionKind;
@@ -2013,16 +2062,17 @@ private:
bool DebugInfoForProfiling;
bool RangesBaseAddress;
- DICompileUnit(LLVMContext &C, StorageType Storage, unsigned SourceLanguage,
- bool IsOptimized, unsigned RuntimeVersion,
- unsigned EmissionKind, uint64_t DWOId, bool SplitDebugInlining,
- bool DebugInfoForProfiling, unsigned NameTableKind,
- bool RangesBaseAddress, ArrayRef<Metadata *> Ops);
+ DICompileUnit(LLVMContext &C, StorageType Storage,
+ DISourceLanguageName SourceLanguage, bool IsOptimized,
+ unsigned RuntimeVersion, unsigned EmissionKind, uint64_t DWOId,
+ bool SplitDebugInlining, bool DebugInfoForProfiling,
+ unsigned NameTableKind, bool RangesBaseAddress,
+ ArrayRef<Metadata *> Ops);
~DICompileUnit() = default;
static DICompileUnit *
- getImpl(LLVMContext &Context, unsigned SourceLanguage, DIFile *File,
- StringRef Producer, bool IsOptimized, StringRef Flags,
+ getImpl(LLVMContext &Context, DISourceLanguageName SourceLanguage,
+ DIFile *File, StringRef Producer, bool IsOptimized, StringRef Flags,
unsigned RuntimeVersion, StringRef SplitDebugFilename,
unsigned EmissionKind, DICompositeTypeArray EnumTypes,
DIScopeArray RetainedTypes,
@@ -2042,8 +2092,8 @@ private:
getCanonicalMDString(Context, SDK), Storage, ShouldCreate);
}
LLVM_ABI static DICompileUnit *
- getImpl(LLVMContext &Context, unsigned SourceLanguage, Metadata *File,
- MDString *Producer, bool IsOptimized, MDString *Flags,
+ getImpl(LLVMContext &Context, DISourceLanguageName SourceLanguage,
+ Metadata *File, MDString *Producer, bool IsOptimized, MDString *Flags,
unsigned RuntimeVersion, MDString *SplitDebugFilename,
unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
Metadata *GlobalVariables, Metadata *ImportedEntities,
@@ -2068,7 +2118,7 @@ public:
DEFINE_MDNODE_GET_DISTINCT_TEMPORARY(
DICompileUnit,
- (unsigned SourceLanguage, DIFile *File, StringRef Producer,
+ (DISourceLanguageName SourceLanguage, DIFile *File, StringRef Producer,
bool IsOptimized, StringRef Flags, unsigned RuntimeVersion,
StringRef SplitDebugFilename, DebugEmissionKind EmissionKind,
DICompositeTypeArray EnumTypes, DIScopeArray RetainedTypes,
@@ -2084,7 +2134,7 @@ public:
SysRoot, SDK))
DEFINE_MDNODE_GET_DISTINCT_TEMPORARY(
DICompileUnit,
- (unsigned SourceLanguage, Metadata *File, MDString *Producer,
+ (DISourceLanguageName SourceLanguage, Metadata *File, MDString *Producer,
bool IsOptimized, MDString *Flags, unsigned RuntimeVersion,
MDString *SplitDebugFilename, unsigned EmissionKind, Metadata *EnumTypes,
Metadata *RetainedTypes, Metadata *GlobalVariables,
@@ -2099,7 +2149,7 @@ public:
TempDICompileUnit clone() const { return cloneImpl(); }
- unsigned getSourceLanguage() const { return SourceLanguage; }
+ DISourceLanguageName getSourceLanguage() const { return SourceLanguage; }
bool isOptimized() const { return IsOptimized; }
unsigned getRuntimeVersion() const { return RuntimeVersion; }
DebugEmissionKind getEmissionKind() const {
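
[editor note] A short sketch of the two constructor shapes of DISourceLanguageName and the accessor contract (the versioned form makes getVersion() valid; the unversioned form keeps the transitional getUnversionedName() usable):

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include <cassert>

int main() {
  using namespace llvm;
  DISourceLanguageName Versioned(dwarf::DW_LNAME_C_plus_plus, 201703); // C++17
  DISourceLanguageName Plain(dwarf::DW_LNAME_C_plus_plus);

  assert(Versioned.hasVersionedName() && Versioned.getVersion() == 201703);
  assert(!Plain.hasVersionedName());
  assert(Plain.getUnversionedName() == dwarf::DW_LNAME_C_plus_plus);
  return 0;
}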
diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index 64cad80..d8dd1c4 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -156,6 +156,7 @@ protected:
std::vector<std::unique_ptr<Matcher::Glob>> Globs;
std::vector<std::unique_ptr<Reg>> RegExes;
+ bool RemoveDotSlash = false;
};
using SectionEntries = StringMap<StringMap<Matcher>>;
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
index ee3cc95..2d0f957 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
@@ -24,6 +24,7 @@ struct SimplifyCFGOptions {
int BonusInstThreshold = 1;
bool ForwardSwitchCondToPhi = false;
bool ConvertSwitchRangeToICmp = false;
+ bool ConvertSwitchToArithmetic = false;
bool ConvertSwitchToLookupTable = false;
bool NeedCanonicalLoop = true;
bool HoistCommonInsts = false;
@@ -48,6 +49,10 @@ struct SimplifyCFGOptions {
ConvertSwitchRangeToICmp = B;
return *this;
}
+ SimplifyCFGOptions &convertSwitchToArithmetic(bool B) {
+ ConvertSwitchToArithmetic = B;
+ return *this;
+ }
SimplifyCFGOptions &convertSwitchToLookupTable(bool B) {
ConvertSwitchToLookupTable = B;
return *this;
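
[editor note] Since each setter returns *this, the new ConvertSwitchToArithmetic knob composes with the existing ones; a sketch assuming only this header:

#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"

llvm::SimplifyCFGOptions aggressiveSwitchOpts() {
  return llvm::SimplifyCFGOptions()
      .convertSwitchRangeToICmp(true)
      .convertSwitchToArithmetic(true) // the option added in this change
      .convertSwitchToLookupTable(true);
}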
diff --git a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
index 0fbf082..f31d625 100644
--- a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
+++ b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
@@ -43,11 +43,13 @@ static void printModuleDebugInfo(raw_ostream &O, const Module *M,
// filenames), so just print a few useful things.
for (DICompileUnit *CU : Finder.compile_units()) {
O << "Compile unit: ";
- auto Lang = dwarf::LanguageString(CU->getSourceLanguage());
+ auto Lang =
+ dwarf::LanguageString(CU->getSourceLanguage().getUnversionedName());
if (!Lang.empty())
O << Lang;
else
- O << "unknown-language(" << CU->getSourceLanguage() << ")";
+ O << "unknown-language(" << CU->getSourceLanguage().getUnversionedName()
+ << ")";
printFile(O, CU->getFilename(), CU->getDirectory());
O << '\n';
}
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 63e1b14..6f6776c 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6351,19 +6351,20 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
return getGEPExpr(GEP, IndexExprs);
}
-APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
+APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S,
+ const Instruction *CtxI) {
uint64_t BitWidth = getTypeSizeInBits(S->getType());
auto GetShiftedByZeros = [BitWidth](uint32_t TrailingZeros) {
return TrailingZeros >= BitWidth
? APInt::getZero(BitWidth)
: APInt::getOneBitSet(BitWidth, TrailingZeros);
};
- auto GetGCDMultiple = [this](const SCEVNAryExpr *N) {
+ auto GetGCDMultiple = [this, CtxI](const SCEVNAryExpr *N) {
// The result is GCD of all operands results.
- APInt Res = getConstantMultiple(N->getOperand(0));
+ APInt Res = getConstantMultiple(N->getOperand(0), CtxI);
for (unsigned I = 1, E = N->getNumOperands(); I < E && Res != 1; ++I)
Res = APIntOps::GreatestCommonDivisor(
- Res, getConstantMultiple(N->getOperand(I)));
+ Res, getConstantMultiple(N->getOperand(I), CtxI));
return Res;
};
@@ -6371,33 +6372,33 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
case scConstant:
return cast<SCEVConstant>(S)->getAPInt();
case scPtrToInt:
- return getConstantMultiple(cast<SCEVPtrToIntExpr>(S)->getOperand());
+ return getConstantMultiple(cast<SCEVPtrToIntExpr>(S)->getOperand(), CtxI);
case scUDivExpr:
case scVScale:
return APInt(BitWidth, 1);
case scTruncate: {
// Only multiples that are a power of 2 will hold after truncation.
const SCEVTruncateExpr *T = cast<SCEVTruncateExpr>(S);
- uint32_t TZ = getMinTrailingZeros(T->getOperand());
+ uint32_t TZ = getMinTrailingZeros(T->getOperand(), CtxI);
return GetShiftedByZeros(TZ);
}
case scZeroExtend: {
const SCEVZeroExtendExpr *Z = cast<SCEVZeroExtendExpr>(S);
- return getConstantMultiple(Z->getOperand()).zext(BitWidth);
+ return getConstantMultiple(Z->getOperand(), CtxI).zext(BitWidth);
}
case scSignExtend: {
// Only multiples that are a power of 2 will hold after sext.
const SCEVSignExtendExpr *E = cast<SCEVSignExtendExpr>(S);
- uint32_t TZ = getMinTrailingZeros(E->getOperand());
+ uint32_t TZ = getMinTrailingZeros(E->getOperand(), CtxI);
return GetShiftedByZeros(TZ);
}
case scMulExpr: {
const SCEVMulExpr *M = cast<SCEVMulExpr>(S);
if (M->hasNoUnsignedWrap()) {
// The result is the product of all operand results.
- APInt Res = getConstantMultiple(M->getOperand(0));
+ APInt Res = getConstantMultiple(M->getOperand(0), CtxI);
for (const SCEV *Operand : M->operands().drop_front())
- Res = Res * getConstantMultiple(Operand);
+ Res = Res * getConstantMultiple(Operand, CtxI);
return Res;
}
@@ -6405,7 +6406,7 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
// sum of trailing zeros for all its operands.
uint32_t TZ = 0;
for (const SCEV *Operand : M->operands())
- TZ += getMinTrailingZeros(Operand);
+ TZ += getMinTrailingZeros(Operand, CtxI);
return GetShiftedByZeros(TZ);
}
case scAddExpr:
@@ -6414,9 +6415,9 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
if (N->hasNoUnsignedWrap())
return GetGCDMultiple(N);
// Find the trailing bits, which is the minimum of its operands.
- uint32_t TZ = getMinTrailingZeros(N->getOperand(0));
+ uint32_t TZ = getMinTrailingZeros(N->getOperand(0), CtxI);
for (const SCEV *Operand : N->operands().drop_front())
- TZ = std::min(TZ, getMinTrailingZeros(Operand));
+ TZ = std::min(TZ, getMinTrailingZeros(Operand, CtxI));
return GetShiftedByZeros(TZ);
}
case scUMaxExpr:
@@ -6429,7 +6430,7 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
// ask ValueTracking for known bits
const SCEVUnknown *U = cast<SCEVUnknown>(S);
unsigned Known =
- computeKnownBits(U->getValue(), getDataLayout(), &AC, nullptr, &DT)
+ computeKnownBits(U->getValue(), getDataLayout(), &AC, CtxI, &DT)
.countMinTrailingZeros();
return GetShiftedByZeros(Known);
}
@@ -6439,12 +6440,18 @@ APInt ScalarEvolution::getConstantMultipleImpl(const SCEV *S) {
llvm_unreachable("Unknown SCEV kind!");
}
-APInt ScalarEvolution::getConstantMultiple(const SCEV *S) {
+APInt ScalarEvolution::getConstantMultiple(const SCEV *S,
+ const Instruction *CtxI) {
+ // Skip looking up and updating the cache if there is a context instruction,
+ // as the result will only be valid in the specified context.
+ if (CtxI)
+ return getConstantMultipleImpl(S, CtxI);
+
auto I = ConstantMultipleCache.find(S);
if (I != ConstantMultipleCache.end())
return I->second;
- APInt Result = getConstantMultipleImpl(S);
+ APInt Result = getConstantMultipleImpl(S, CtxI);
auto InsertPair = ConstantMultipleCache.insert({S, Result});
assert(InsertPair.second && "Should insert a new key");
return InsertPair.first->second;
@@ -6455,8 +6462,9 @@ APInt ScalarEvolution::getNonZeroConstantMultiple(const SCEV *S) {
return Multiple == 0 ? APInt(Multiple.getBitWidth(), 1) : Multiple;
}
-uint32_t ScalarEvolution::getMinTrailingZeros(const SCEV *S) {
- return std::min(getConstantMultiple(S).countTrailingZeros(),
+uint32_t ScalarEvolution::getMinTrailingZeros(const SCEV *S,
+ const Instruction *CtxI) {
+ return std::min(getConstantMultiple(S, CtxI).countTrailingZeros(),
(unsigned)getTypeSizeInBits(S->getType()));
}
@@ -10243,8 +10251,7 @@ const SCEV *ScalarEvolution::stripInjectiveFunctions(const SCEV *S) const {
static const SCEV *
SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
SmallVectorImpl<const SCEVPredicate *> *Predicates,
-
- ScalarEvolution &SE) {
+ ScalarEvolution &SE, const Loop *L) {
uint32_t BW = A.getBitWidth();
assert(BW == SE.getTypeSizeInBits(B->getType()));
assert(A != 0 && "A must be non-zero.");
@@ -10260,7 +10267,12 @@ SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
//
// B is divisible by D if and only if the multiplicity of prime factor 2 for B
// is not less than multiplicity of this prime factor for D.
- if (SE.getMinTrailingZeros(B) < Mult2) {
+ unsigned MinTZ = SE.getMinTrailingZeros(B);
+ // Try again with the terminator of the loop predecessor for a
+ // context-specific result if MinTZ is too small.
+ if (MinTZ < Mult2 && L->getLoopPredecessor())
+ MinTZ = SE.getMinTrailingZeros(B, L->getLoopPredecessor()->getTerminator());
+ if (MinTZ < Mult2) {
// Check if we can prove there's no remainder using URem.
const SCEV *URem =
SE.getURemExpr(B, SE.getConstant(APInt::getOneBitSet(BW, Mult2)));
@@ -10708,7 +10720,7 @@ ScalarEvolution::ExitLimit ScalarEvolution::howFarToZero(const SCEV *V,
return getCouldNotCompute();
const SCEV *E = SolveLinEquationWithOverflow(
StepC->getAPInt(), getNegativeSCEV(Start),
- AllowPredicates ? &Predicates : nullptr, *this);
+ AllowPredicates ? &Predicates : nullptr, *this, L);
const SCEV *M = E;
if (E != getCouldNotCompute()) {
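
[editor note] The Mult2 divisibility test can be sanity-checked with small numbers. Solving A*X == B (mod 2^BW) with A = 12 = 2^2 * 3 requires B to carry at least two trailing zero bits; for B = 8 the solutions exist and repeat mod 4:

#include <cstdio>

int main() {
  // 12 * X == 8 (mod 16): A = 2^2 * 3, and B = 8 has 3 trailing zeros >= 2,
  // so the equation is solvable -- the getMinTrailingZeros(B) < Mult2 check
  // above would not reject it.
  for (unsigned X = 0; X < 16; ++X)
    if ((12u * X) % 16u == 8u)
      std::printf("X = %u\n", X); // prints 2, 6, 10, 14
  return 0;
}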
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 897e679..5589966 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -5861,11 +5861,11 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) {
#undef VISIT_MD_FIELDS
Result = DICompileUnit::getDistinct(
- Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val,
- runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val,
- retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val,
- splitDebugInlining.Val, debugInfoForProfiling.Val, nameTableKind.Val,
- rangesBaseAddress.Val, sysroot.Val, sdk.Val);
+ Context, DISourceLanguageName(language.Val), file.Val, producer.Val,
+ isOptimized.Val, flags.Val, runtimeVersion.Val, splitDebugFilename.Val,
+ emissionKind.Val, enums.Val, retainedTypes.Val, globals.Val, imports.Val,
+ macros.Val, dwoId.Val, splitDebugInlining.Val, debugInfoForProfiling.Val,
+ nameTableKind.Val, rangesBaseAddress.Val, sysroot.Val, sdk.Val);
return false;
}
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index 8b24044..969047a 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -472,6 +472,137 @@ StringRef llvm::dwarf::LanguageDescription(dwarf::SourceLanguageName lname) {
return "Unknown";
}
+StringRef llvm::dwarf::LanguageDescription(dwarf::SourceLanguageName Name,
+ uint32_t Version) {
+ switch (Name) {
+ // YYYY
+ case DW_LNAME_Ada: {
+ if (Version <= 1983)
+ return "Ada 83";
+ if (Version <= 1995)
+ return "Ada 95";
+ if (Version <= 2005)
+ return "Ada 2005";
+ if (Version <= 2012)
+ return "Ada 2012";
+ } break;
+
+ case DW_LNAME_Cobol: {
+ if (Version <= 1974)
+ return "COBOL-74";
+ if (Version <= 1985)
+ return "COBOL-85";
+ } break;
+
+ case DW_LNAME_Fortran: {
+ if (Version <= 1977)
+ return "FORTRAN 77";
+ if (Version <= 1990)
+ return "FORTRAN 90";
+ if (Version <= 1995)
+ return "Fortran 95";
+ if (Version <= 2003)
+ return "Fortran 2003";
+ if (Version <= 2008)
+ return "Fortran 2008";
+ if (Version <= 2018)
+ return "Fortran 2018";
+ } break;
+
+ // YYYYMM
+ case DW_LNAME_C: {
+ if (Version == 0)
+ break;
+ if (Version <= 198912)
+ return "C89";
+ if (Version <= 199901)
+ return "C99";
+ if (Version <= 201112)
+ return "C11";
+ if (Version <= 201710)
+ return "C17";
+ } break;
+
+ case DW_LNAME_C_plus_plus: {
+ if (Version == 0)
+ break;
+ if (Version <= 199711)
+ return "C++98";
+ if (Version <= 200310)
+ return "C++03";
+ if (Version <= 201103)
+ return "C++11";
+ if (Version <= 201402)
+ return "C++14";
+ if (Version <= 201703)
+ return "C++17";
+ if (Version <= 202002)
+ return "C++20";
+ } break;
+
+ case DW_LNAME_ObjC_plus_plus:
+ case DW_LNAME_ObjC:
+ case DW_LNAME_Move:
+ case DW_LNAME_SYCL:
+ case DW_LNAME_BLISS:
+ case DW_LNAME_Crystal:
+ case DW_LNAME_D:
+ case DW_LNAME_Dylan:
+ case DW_LNAME_Go:
+ case DW_LNAME_Haskell:
+ case DW_LNAME_HLSL:
+ case DW_LNAME_Java:
+ case DW_LNAME_Julia:
+ case DW_LNAME_Kotlin:
+ case DW_LNAME_Modula2:
+ case DW_LNAME_Modula3:
+ case DW_LNAME_OCaml:
+ case DW_LNAME_OpenCL_C:
+ case DW_LNAME_Pascal:
+ case DW_LNAME_PLI:
+ case DW_LNAME_Python:
+ case DW_LNAME_RenderScript:
+ case DW_LNAME_Rust:
+ case DW_LNAME_Swift:
+ case DW_LNAME_UPC:
+ case DW_LNAME_Zig:
+ case DW_LNAME_Assembly:
+ case DW_LNAME_C_sharp:
+ case DW_LNAME_Mojo:
+ case DW_LNAME_GLSL:
+ case DW_LNAME_GLSL_ES:
+ case DW_LNAME_OpenCL_CPP:
+ case DW_LNAME_CPP_for_OpenCL:
+ case DW_LNAME_Ruby:
+ case DW_LNAME_Hylo:
+ case DW_LNAME_Metal:
+ break;
+ }
+
+ // Fall back to the version-independent name.
+ return LanguageDescription(Name);
+}
+
+llvm::StringRef llvm::dwarf::SourceLanguageNameString(SourceLanguageName Lang) {
+ switch (Lang) {
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND) \
+ case DW_LNAME_##NAME: \
+ return "DW_LNAME_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+
+ return {};
+}
+
+unsigned
+llvm::dwarf::getSourceLanguageName(StringRef SourceLanguageNameString) {
+ return StringSwitch<unsigned>(SourceLanguageNameString)
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND) \
+ .Case("DW_LNAME_" #NAME, DW_LNAME_##NAME)
+#include "llvm/BinaryFormat/Dwarf.def"
+ .Default(0);
+}
+
StringRef llvm::dwarf::CaseString(unsigned Case) {
switch (Case) {
case DW_ID_case_sensitive:
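
[editor note] A standalone check of the two helpers added above: the version-aware LanguageDescription uses the YYYYMM scheme for C and C++ and falls back to the unversioned name, and getSourceLanguageName round-trips the DW_LNAME_ spellings:

#include "llvm/BinaryFormat/Dwarf.h"
#include <cassert>

int main() {
  using namespace llvm::dwarf;
  assert(LanguageDescription(DW_LNAME_C_plus_plus, 201402) == "C++14");
  // Unrecognized versions fall back to the version-independent description.
  assert(LanguageDescription(DW_LNAME_C_plus_plus, 999999) ==
         LanguageDescription(DW_LNAME_C_plus_plus));
  assert(getSourceLanguageName("DW_LNAME_Rust") == DW_LNAME_Rust);
  assert(getSourceLanguageName("not-a-language") == 0);
  return 0;
}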
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 22c7fa5..a4d1b83 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -1866,11 +1866,13 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
// Ignore Record[0], which indicates whether this compile unit is
// distinct. It's always distinct.
IsDistinct = true;
+
auto *CU = DICompileUnit::getDistinct(
- Context, Record[1], getMDOrNull(Record[2]), getMDString(Record[3]),
- Record[4], getMDString(Record[5]), Record[6], getMDString(Record[7]),
- Record[8], getMDOrNull(Record[9]), getMDOrNull(Record[10]),
- getMDOrNull(Record[12]), getMDOrNull(Record[13]),
+ Context, DISourceLanguageName(Record[1]), getMDOrNull(Record[2]),
+ getMDString(Record[3]), Record[4], getMDString(Record[5]), Record[6],
+ getMDString(Record[7]), Record[8], getMDOrNull(Record[9]),
+ getMDOrNull(Record[10]), getMDOrNull(Record[12]),
+ getMDOrNull(Record[13]),
Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]),
Record.size() <= 14 ? 0 : Record[14],
Record.size() <= 16 ? true : Record[16],
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 6d86809..7ed140d 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -2107,7 +2107,8 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
unsigned Abbrev) {
assert(N->isDistinct() && "Expected distinct compile units");
Record.push_back(/* IsDistinct */ true);
- Record.push_back(N->getSourceLanguage());
+
+ Record.push_back(N->getSourceLanguage().getUnversionedName());
Record.push_back(VE.getMetadataOrNullID(N->getFile()));
Record.push_back(VE.getMetadataOrNullID(N->getRawProducer()));
Record.push_back(N->isOptimized());
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index c5d6e40..12d749c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -633,8 +633,8 @@ void CodeViewDebug::beginModule(Module *M) {
Node = *CUs->operands().begin();
}
const auto *CU = cast<DICompileUnit>(Node);
-
- CurrentSourceLanguage = MapDWLangToCVLang(CU->getSourceLanguage());
+ CurrentSourceLanguage =
+ MapDWLangToCVLang(CU->getSourceLanguage().getUnversionedName());
if (!M->getCodeViewFlag() ||
CU->getEmissionKind() == DICompileUnit::NoDebug) {
Asm = nullptr;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 09d5f9c..d751a7f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1040,7 +1040,8 @@ void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit,
NewCU.addString(Die, dwarf::DW_AT_producer, Producer);
NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
- DIUnit->getSourceLanguage());
+ DIUnit->getSourceLanguage().getUnversionedName());
+
NewCU.addString(Die, dwarf::DW_AT_name, FN);
StringRef SysRoot = DIUnit->getSysRoot();
if (!SysRoot.empty())
@@ -2930,10 +2931,9 @@ static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
case dwarf::DW_TAG_union_type:
case dwarf::DW_TAG_enumeration_type:
return dwarf::PubIndexEntryDescriptor(
- dwarf::GIEK_TYPE,
- dwarf::isCPlusPlus((dwarf::SourceLanguage)CU->getLanguage())
- ? dwarf::GIEL_EXTERNAL
- : dwarf::GIEL_STATIC);
+ dwarf::GIEK_TYPE, dwarf::isCPlusPlus(CU->getSourceLanguage())
+ ? dwarf::GIEL_EXTERNAL
+ : dwarf::GIEL_STATIC);
case dwarf::DW_TAG_typedef:
case dwarf::DW_TAG_base_type:
case dwarf::DW_TAG_subrange_type:
@@ -3926,7 +3926,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
TypeUnitsUnderConstruction.emplace_back(std::move(OwnedUnit), CTy);
NewTU.addUInt(UnitDie, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
- CU.getLanguage());
+ CU.getSourceLanguage());
uint64_t Signature = makeTypeSignature(Identifier);
NewTU.setTypeSignature(Signature);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 3cfe7cc..aa078f3 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -100,7 +100,7 @@ DwarfUnit::~DwarfUnit() {
}
int64_t DwarfUnit::getDefaultLowerBound() const {
- switch (getLanguage()) {
+ switch (getSourceLanguage()) {
default:
break;
@@ -704,12 +704,17 @@ void DwarfUnit::addType(DIE &Entity, const DIType *Ty,
addDIEEntry(Entity, Attribute, DIEEntry(*getOrCreateTypeDIE(Ty)));
}
+llvm::dwarf::SourceLanguage DwarfUnit::getSourceLanguage() const {
+ return static_cast<llvm::dwarf::SourceLanguage>(
+ getLanguage().getUnversionedName());
+}
+
std::string DwarfUnit::getParentContextString(const DIScope *Context) const {
if (!Context)
return "";
// FIXME: Decide whether to implement this for non-C++ languages.
- if (!dwarf::isCPlusPlus((dwarf::SourceLanguage)getLanguage()))
+ if (!dwarf::isCPlusPlus(getSourceLanguage()))
return "";
std::string CS;
@@ -940,7 +945,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
// Add prototype flag if we're dealing with a C language and the function has
// been prototyped.
- if (isPrototyped && dwarf::isC((dwarf::SourceLanguage)getLanguage()))
+ if (isPrototyped && dwarf::isC(getSourceLanguage()))
addFlag(Buffer, dwarf::DW_AT_prototyped);
// Add a DW_AT_calling_convention if this has an explicit convention.
@@ -1448,7 +1453,7 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
// Add the prototype if we have a prototype and we have a C like
// language.
- if (SP->isPrototyped() && dwarf::isC((dwarf::SourceLanguage)getLanguage()))
+ if (SP->isPrototyped() && dwarf::isC(getSourceLanguage()))
addFlag(SPDie, dwarf::DW_AT_prototyped);
if (SP->isObjCDirect())
@@ -1700,8 +1705,7 @@ DIE *DwarfUnit::getIndexTyDie() {
addString(*IndexTyDie, dwarf::DW_AT_name, Name);
addUInt(*IndexTyDie, dwarf::DW_AT_byte_size, std::nullopt, sizeof(int64_t));
addUInt(*IndexTyDie, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
- dwarf::getArrayIndexTypeEncoding(
- (dwarf::SourceLanguage)getLanguage()));
+ dwarf::getArrayIndexTypeEncoding(getSourceLanguage()));
DD->addAccelType(*this, CUNode->getNameTableKind(), Name, *IndexTyDie,
/*Flags*/ 0);
return IndexTyDie;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index bb00ec3..9288d7e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DIE.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Target/TargetMachine.h"
#include <optional>
#include <string>
@@ -107,7 +108,7 @@ public:
return LabelBegin;
}
MCSymbol *getEndLabel() const { return EndLabel; }
- uint16_t getLanguage() const { return CUNode->getSourceLanguage(); }
+ llvm::dwarf::SourceLanguage getSourceLanguage() const;
const DICompileUnit *getCUNode() const { return CUNode; }
DwarfDebug &getDwarfDebug() const { return *DD; }
@@ -358,6 +359,10 @@ protected:
}
private:
+ DISourceLanguageName getLanguage() const {
+ return CUNode->getSourceLanguage();
+ }
+
/// A helper to add a wide integer constant to a DIE using a block
/// form.
void addIntAsBlock(DIE &Die, dwarf::Attribute Attribute, const APInt &Val);
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 851d445..507b2d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1843,7 +1843,8 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) {
return selectCast(I, ISD::SINT_TO_FP);
case Instruction::IntToPtr: // Deliberate fall-through.
- case Instruction::PtrToInt: {
+ case Instruction::PtrToInt:
+ case Instruction::PtrToAddr: {
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
EVT DstVT = TLI.getValueType(DL, I->getType());
if (DstVT.bitsGT(SrcVT))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index c35f29d..175753f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -571,7 +571,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
SwiftError->setFunction(mf);
const Function &Fn = mf.getFunction();
- bool InstrRef = mf.shouldUseDebugInstrRef();
+ bool InstrRef = mf.useDebugInstrRef();
FuncInfo->set(MF->getFunction(), *MF, CurDAG);
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
index 707f0c3..63189f4 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp
@@ -24,15 +24,7 @@ namespace llvm {
namespace hlsl {
namespace rootsig {
-char GenericRSMetadataError::ID;
-char InvalidRSMetadataFormat::ID;
-char InvalidRSMetadataValue::ID;
-char TableSamplerMixinError::ID;
-char ShaderRegisterOverflowError::ID;
-char OffsetOverflowError::ID;
-char OffsetAppendAfterOverflow::ID;
-
-template <typename T> char RootSignatureValidationError<T>::ID;
+char RootSignatureValidationError::ID;
static std::optional<uint32_t> extractMdIntValue(MDNode *Node,
unsigned int OpId) {
@@ -57,20 +49,6 @@ static std::optional<StringRef> extractMdStringValue(MDNode *Node,
return NodeText->getString();
}
-template <typename T, typename = std::enable_if_t<
- std::is_enum_v<T> &&
- std::is_same_v<std::underlying_type_t<T>, uint32_t>>>
-static Expected<T>
-extractEnumValue(MDNode *Node, unsigned int OpId, StringRef ErrText,
- llvm::function_ref<bool(uint32_t)> VerifyFn) {
- if (std::optional<uint32_t> Val = extractMdIntValue(Node, OpId)) {
- if (!VerifyFn(*Val))
- return make_error<RootSignatureValidationError<uint32_t>>(ErrText, *Val);
- return static_cast<T>(*Val);
- }
- return make_error<InvalidRSMetadataValue>("ShaderVisibility");
-}
-
namespace {
// We use OverloadedVisit with std::visit to ensure the compiler catches if a
@@ -81,8 +59,52 @@ template <class... Ts> struct OverloadedVisit : Ts... {
};
template <class... Ts> OverloadedVisit(Ts...) -> OverloadedVisit<Ts...>;
+struct FmtRange {
+ dxil::ResourceClass Type;
+ uint32_t Register;
+ uint32_t Space;
+
+ FmtRange(const mcdxbc::DescriptorRange &Range)
+ : Type(Range.RangeType), Register(Range.BaseShaderRegister),
+ Space(Range.RegisterSpace) {}
+};
+
+raw_ostream &operator<<(llvm::raw_ostream &OS, const FmtRange &Range) {
+ OS << getResourceClassName(Range.Type) << "(register=" << Range.Register
+ << ", space=" << Range.Space << ")";
+ return OS;
+}
+
+struct FmtMDNode {
+ const MDNode *Node;
+
+ FmtMDNode(const MDNode *Node) : Node(Node) {}
+};
+
+raw_ostream &operator<<(llvm::raw_ostream &OS, FmtMDNode Fmt) {
+ Fmt.Node->printTree(OS);
+ return OS;
+}
+
+static Error makeRSError(const Twine &Msg) {
+ return make_error<RootSignatureValidationError>(Msg);
+}
} // namespace
+template <typename T, typename = std::enable_if_t<
+ std::is_enum_v<T> &&
+ std::is_same_v<std::underlying_type_t<T>, uint32_t>>>
+static Expected<T>
+extractEnumValue(MDNode *Node, unsigned int OpId, StringRef ErrText,
+ llvm::function_ref<bool(uint32_t)> VerifyFn) {
+ if (std::optional<uint32_t> Val = extractMdIntValue(Node, OpId)) {
+ if (!VerifyFn(*Val))
+ return makeRSError(formatv("Invalid value for {0}: {1}", ErrText, Val));
+ return static_cast<T>(*Val);
+ }
+ return makeRSError(formatv("Invalid value for {0}:", ErrText));
+}
+
MDNode *MetadataBuilder::BuildRootSignature() {
const auto Visitor = OverloadedVisit{
[this](const dxbc::RootFlags &Flags) -> MDNode * {
@@ -226,12 +248,12 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) {
Error MetadataParser::parseRootFlags(mcdxbc::RootSignatureDesc &RSD,
MDNode *RootFlagNode) {
if (RootFlagNode->getNumOperands() != 2)
- return make_error<InvalidRSMetadataFormat>("RootFlag Element");
+ return makeRSError("Invalid format for RootFlags Element");
if (std::optional<uint32_t> Val = extractMdIntValue(RootFlagNode, 1))
RSD.Flags = *Val;
else
- return make_error<InvalidRSMetadataValue>("RootFlag");
+ return makeRSError("Invalid value for RootFlag");
return Error::success();
}
@@ -239,7 +261,7 @@ Error MetadataParser::parseRootFlags(mcdxbc::RootSignatureDesc &RSD,
Error MetadataParser::parseRootConstants(mcdxbc::RootSignatureDesc &RSD,
MDNode *RootConstantNode) {
if (RootConstantNode->getNumOperands() != 5)
- return make_error<InvalidRSMetadataFormat>("RootConstants Element");
+ return makeRSError("Invalid format for RootConstants Element");
Expected<dxbc::ShaderVisibility> Visibility =
extractEnumValue<dxbc::ShaderVisibility>(RootConstantNode, 1,
@@ -252,17 +274,17 @@ Error MetadataParser::parseRootConstants(mcdxbc::RootSignatureDesc &RSD,
if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 2))
Constants.ShaderRegister = *Val;
else
- return make_error<InvalidRSMetadataValue>("ShaderRegister");
+ return makeRSError("Invalid value for ShaderRegister");
if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 3))
Constants.RegisterSpace = *Val;
else
- return make_error<InvalidRSMetadataValue>("RegisterSpace");
+ return makeRSError("Invalid value for RegisterSpace");
if (std::optional<uint32_t> Val = extractMdIntValue(RootConstantNode, 4))
Constants.Num32BitValues = *Val;
else
- return make_error<InvalidRSMetadataValue>("Num32BitValues");
+ return makeRSError("Invalid value for Num32BitValues");
RSD.ParametersContainer.addParameter(dxbc::RootParameterType::Constants32Bit,
*Visibility, Constants);
@@ -279,7 +301,7 @@ Error MetadataParser::parseRootDescriptors(
"parseRootDescriptors should only be called with RootDescriptor "
"element kind.");
if (RootDescriptorNode->getNumOperands() != 5)
- return make_error<InvalidRSMetadataFormat>("Root Descriptor Element");
+ return makeRSError("Invalid format for Root Descriptor Element");
dxbc::RootParameterType Type;
switch (ElementKind) {
@@ -308,12 +330,12 @@ Error MetadataParser::parseRootDescriptors(
if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 2))
Descriptor.ShaderRegister = *Val;
else
- return make_error<InvalidRSMetadataValue>("ShaderRegister");
+ return makeRSError("Invalid value for ShaderRegister");
if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 3))
Descriptor.RegisterSpace = *Val;
else
- return make_error<InvalidRSMetadataValue>("RegisterSpace");
+ return makeRSError("Invalid value for RegisterSpace");
if (RSD.Version == 1) {
RSD.ParametersContainer.addParameter(Type, *Visibility, Descriptor);
@@ -324,7 +346,7 @@ Error MetadataParser::parseRootDescriptors(
if (std::optional<uint32_t> Val = extractMdIntValue(RootDescriptorNode, 4))
Descriptor.Flags = *Val;
else
- return make_error<InvalidRSMetadataValue>("Root Descriptor Flags");
+ return makeRSError("Invalid value for Root Descriptor Flags");
RSD.ParametersContainer.addParameter(Type, *Visibility, Descriptor);
return Error::success();
@@ -333,7 +355,7 @@ Error MetadataParser::parseRootDescriptors(
Error MetadataParser::parseDescriptorRange(mcdxbc::DescriptorTable &Table,
MDNode *RangeDescriptorNode) {
if (RangeDescriptorNode->getNumOperands() != 6)
- return make_error<InvalidRSMetadataFormat>("Descriptor Range");
+ return makeRSError("Invalid format for Descriptor Range");
mcdxbc::DescriptorRange Range;
@@ -341,7 +363,7 @@ Error MetadataParser::parseDescriptorRange(mcdxbc::DescriptorTable &Table,
extractMdStringValue(RangeDescriptorNode, 0);
if (!ElementText.has_value())
- return make_error<InvalidRSMetadataFormat>("Descriptor Range");
+ return makeRSError("Invalid format for Descriptor Range");
if (*ElementText == "CBV")
Range.RangeType = dxil::ResourceClass::CBuffer;
@@ -352,35 +374,34 @@ Error MetadataParser::parseDescriptorRange(mcdxbc::DescriptorTable &Table,
else if (*ElementText == "Sampler")
Range.RangeType = dxil::ResourceClass::Sampler;
else
- return make_error<GenericRSMetadataError>("Invalid Descriptor Range type.",
- RangeDescriptorNode);
+ return makeRSError(formatv("Invalid Descriptor Range type.\n{0}",
+ FmtMDNode{RangeDescriptorNode}));
if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 1))
Range.NumDescriptors = *Val;
else
- return make_error<GenericRSMetadataError>("Number of Descriptor in Range",
- RangeDescriptorNode);
+ return makeRSError(formatv("Invalid number of Descriptors in Range.\n{0}",
+ FmtMDNode{RangeDescriptorNode}));
if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 2))
Range.BaseShaderRegister = *Val;
else
- return make_error<InvalidRSMetadataValue>("BaseShaderRegister");
+ return makeRSError("Invalid value for BaseShaderRegister");
if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 3))
Range.RegisterSpace = *Val;
else
- return make_error<InvalidRSMetadataValue>("RegisterSpace");
+ return makeRSError("Invalid value for RegisterSpace");
if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 4))
Range.OffsetInDescriptorsFromTableStart = *Val;
else
- return make_error<InvalidRSMetadataValue>(
- "OffsetInDescriptorsFromTableStart");
+ return makeRSError("Invalid value for OffsetInDescriptorsFromTableStart");
if (std::optional<uint32_t> Val = extractMdIntValue(RangeDescriptorNode, 5))
Range.Flags = *Val;
else
- return make_error<InvalidRSMetadataValue>("Descriptor Range Flags");
+ return makeRSError("Invalid value for Descriptor Range Flags");
Table.Ranges.push_back(Range);
return Error::success();
@@ -390,7 +411,7 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD,
MDNode *DescriptorTableNode) {
const unsigned int NumOperands = DescriptorTableNode->getNumOperands();
if (NumOperands < 2)
- return make_error<InvalidRSMetadataFormat>("Descriptor Table");
+ return makeRSError("Invalid format for Descriptor Table");
Expected<dxbc::ShaderVisibility> Visibility =
extractEnumValue<dxbc::ShaderVisibility>(DescriptorTableNode, 1,
@@ -404,8 +425,8 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD,
for (unsigned int I = 2; I < NumOperands; I++) {
MDNode *Element = dyn_cast<MDNode>(DescriptorTableNode->getOperand(I));
if (Element == nullptr)
- return make_error<GenericRSMetadataError>(
- "Missing Root Element Metadata Node.", DescriptorTableNode);
+ return makeRSError(formatv("Missing Root Element Metadata Node.\n{0}",
+ FmtMDNode{DescriptorTableNode}));
if (auto Err = parseDescriptorRange(Table, Element))
return Err;
@@ -419,7 +440,7 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD,
Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD,
MDNode *StaticSamplerNode) {
if (StaticSamplerNode->getNumOperands() != 15)
- return make_error<InvalidRSMetadataFormat>("Static Sampler");
+ return makeRSError("Invalid format for Static Sampler");
mcdxbc::StaticSampler Sampler;
@@ -453,12 +474,12 @@ Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD,
if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 5))
Sampler.MipLODBias = *Val;
else
- return make_error<InvalidRSMetadataValue>("MipLODBias");
+ return makeRSError("Invalid value for MipLODBias");
if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 6))
Sampler.MaxAnisotropy = *Val;
else
- return make_error<InvalidRSMetadataValue>("MaxAnisotropy");
+ return makeRSError("Invalid value for MaxAnisotropy");
Expected<dxbc::ComparisonFunc> ComparisonFunc =
extractEnumValue<dxbc::ComparisonFunc>(
@@ -477,22 +498,22 @@ Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD,
if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 9))
Sampler.MinLOD = *Val;
else
- return make_error<InvalidRSMetadataValue>("MinLOD");
+ return makeRSError("Invalid value for MinLOD");
if (std::optional<float> Val = extractMdFloatValue(StaticSamplerNode, 10))
Sampler.MaxLOD = *Val;
else
- return make_error<InvalidRSMetadataValue>("MaxLOD");
+ return makeRSError("Invalid value for MaxLOD");
if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 11))
Sampler.ShaderRegister = *Val;
else
- return make_error<InvalidRSMetadataValue>("ShaderRegister");
+ return makeRSError("Invalid value for ShaderRegister");
if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 12))
Sampler.RegisterSpace = *Val;
else
- return make_error<InvalidRSMetadataValue>("RegisterSpace");
+ return makeRSError("Invalid value for RegisterSpace");
Expected<dxbc::ShaderVisibility> Visibility =
extractEnumValue<dxbc::ShaderVisibility>(StaticSamplerNode, 13,
@@ -511,7 +532,7 @@ Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD,
if (std::optional<uint32_t> Val = extractMdIntValue(StaticSamplerNode, 14))
Sampler.Flags = *Val;
else
- return make_error<InvalidRSMetadataValue>("Static Sampler Flags");
+ return makeRSError("Invalid value for Static Sampler Flags");
RSD.StaticSamplers.push_back(Sampler);
return Error::success();
@@ -521,7 +542,7 @@ Error MetadataParser::parseRootSignatureElement(mcdxbc::RootSignatureDesc &RSD,
MDNode *Element) {
std::optional<StringRef> ElementText = extractMdStringValue(Element, 0);
if (!ElementText.has_value())
- return make_error<InvalidRSMetadataFormat>("Root Element");
+ return makeRSError("Invalid format for Root Element");
RootSignatureElementKind ElementKind =
StringSwitch<RootSignatureElementKind>(*ElementText)
@@ -549,8 +570,8 @@ Error MetadataParser::parseRootSignatureElement(mcdxbc::RootSignatureDesc &RSD,
case RootSignatureElementKind::StaticSamplers:
return parseStaticSampler(RSD, Element);
case RootSignatureElementKind::Error:
- return make_error<GenericRSMetadataError>("Invalid Root Signature Element",
- Element);
+ return makeRSError(
+ formatv("Invalid Root Signature Element\n{0}", FmtMDNode{Element}));
}
llvm_unreachable("Unhandled RootSignatureElementKind enum.");
@@ -563,7 +584,10 @@ validateDescriptorTableSamplerMixin(const mcdxbc::DescriptorTable &Table,
for (const mcdxbc::DescriptorRange &Range : Table.Ranges) {
if (Range.RangeType == dxil::ResourceClass::Sampler &&
CurrRC != dxil::ResourceClass::Sampler)
- return make_error<TableSamplerMixinError>(CurrRC, Location);
+ return makeRSError(
+ formatv("Samplers cannot be mixed with other resource types in a "
+ "descriptor table, {0}(location={1})",
+ getResourceClassName(CurrRC), Location));
CurrRC = Range.RangeType;
}
return Error::success();
@@ -583,8 +607,8 @@ validateDescriptorTableRegisterOverflow(const mcdxbc::DescriptorTable &Table,
Range.BaseShaderRegister, Range.NumDescriptors);
if (!verifyNoOverflowedOffset(RangeBound))
- return make_error<ShaderRegisterOverflowError>(
- Range.RangeType, Range.BaseShaderRegister, Range.RegisterSpace);
+ return makeRSError(
+ formatv("Overflow for shader register range: {0}", FmtRange{Range}));
bool IsAppending =
Range.OffsetInDescriptorsFromTableStart == DescriptorTableOffsetAppend;
@@ -592,15 +616,16 @@ validateDescriptorTableRegisterOverflow(const mcdxbc::DescriptorTable &Table,
Offset = Range.OffsetInDescriptorsFromTableStart;
if (IsPrevUnbound && IsAppending)
- return make_error<OffsetAppendAfterOverflow>(
- Range.RangeType, Range.BaseShaderRegister, Range.RegisterSpace);
+ return makeRSError(
+ formatv("Range {0} cannot be appended after an unbounded range",
+ FmtRange{Range}));
const uint64_t OffsetBound =
llvm::hlsl::rootsig::computeRangeBound(Offset, Range.NumDescriptors);
if (!verifyNoOverflowedOffset(OffsetBound))
- return make_error<OffsetOverflowError>(
- Range.RangeType, Range.BaseShaderRegister, Range.RegisterSpace);
+ return makeRSError(formatv("Offset overflow for descriptor range: {0}.",
+ FmtRange{Range}));
Offset = OffsetBound + 1;
IsPrevUnbound =
@@ -614,17 +639,15 @@ Error MetadataParser::validateRootSignature(
const mcdxbc::RootSignatureDesc &RSD) {
Error DeferredErrs = Error::success();
if (!hlsl::rootsig::verifyVersion(RSD.Version)) {
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "Version", RSD.Version));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for Version: {0}", RSD.Version)));
}
if (!hlsl::rootsig::verifyRootFlag(RSD.Flags)) {
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "RootFlags", RSD.Flags));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for RootFlags: {0}", RSD.Flags)));
}
for (const mcdxbc::RootParameterInfo &Info : RSD.ParametersContainer) {
@@ -639,16 +662,16 @@ Error MetadataParser::validateRootSignature(
const mcdxbc::RootDescriptor &Descriptor =
RSD.ParametersContainer.getRootDescriptor(Info.Location);
if (!hlsl::rootsig::verifyRegisterValue(Descriptor.ShaderRegister))
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "ShaderRegister", Descriptor.ShaderRegister));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for ShaderRegister: {0}",
+ Descriptor.ShaderRegister)));
if (!hlsl::rootsig::verifyRegisterSpace(Descriptor.RegisterSpace))
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "RegisterSpace", Descriptor.RegisterSpace));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for RegisterSpace: {0}",
+ Descriptor.RegisterSpace)));
if (RSD.Version > 1) {
bool IsValidFlag =
@@ -656,10 +679,10 @@ Error MetadataParser::validateRootSignature(
hlsl::rootsig::verifyRootDescriptorFlag(
RSD.Version, dxbc::RootDescriptorFlags(Descriptor.Flags));
if (!IsValidFlag)
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "RootDescriptorFlag", Descriptor.Flags));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for RootDescriptorFlag: {0}",
+ Descriptor.Flags)));
}
break;
}
@@ -668,26 +691,26 @@ Error MetadataParser::validateRootSignature(
RSD.ParametersContainer.getDescriptorTable(Info.Location);
for (const mcdxbc::DescriptorRange &Range : Table) {
if (!hlsl::rootsig::verifyRegisterSpace(Range.RegisterSpace))
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "RegisterSpace", Range.RegisterSpace));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for RegisterSpace: {0}",
+ Range.RegisterSpace)));
if (!hlsl::rootsig::verifyNumDescriptors(Range.NumDescriptors))
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "NumDescriptors", Range.NumDescriptors));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for NumDescriptors: {0}",
+ Range.NumDescriptors)));
bool IsValidFlag = dxbc::isValidDescriptorRangeFlags(Range.Flags) &&
hlsl::rootsig::verifyDescriptorRangeFlag(
RSD.Version, Range.RangeType,
dxbc::DescriptorRangeFlags(Range.Flags));
if (!IsValidFlag)
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "DescriptorFlag", Range.Flags));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for DescriptorFlag: {0}",
+ Range.Flags)));
if (Error Err =
validateDescriptorTableSamplerMixin(Table, Info.Location))
@@ -705,46 +728,49 @@ Error MetadataParser::validateRootSignature(
for (const mcdxbc::StaticSampler &Sampler : RSD.StaticSamplers) {
if (!hlsl::rootsig::verifyMipLODBias(Sampler.MipLODBias))
- DeferredErrs = joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<float>>(
- "MipLODBias", Sampler.MipLODBias));
+ DeferredErrs =
+ joinErrors(std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for MipLODBias: {0:e}",
+ Sampler.MipLODBias)));
if (!hlsl::rootsig::verifyMaxAnisotropy(Sampler.MaxAnisotropy))
DeferredErrs =
joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "MaxAnisotropy", Sampler.MaxAnisotropy));
+ makeRSError(formatv("Invalid value for MaxAnisotropy: {0}",
+ Sampler.MaxAnisotropy)));
if (!hlsl::rootsig::verifyLOD(Sampler.MinLOD))
- DeferredErrs = joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<float>>(
- "MinLOD", Sampler.MinLOD));
+ DeferredErrs =
+ joinErrors(std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for MinLOD: {0}",
+ Sampler.MinLOD)));
if (!hlsl::rootsig::verifyLOD(Sampler.MaxLOD))
- DeferredErrs = joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<float>>(
- "MaxLOD", Sampler.MaxLOD));
-
- if (!hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister))
DeferredErrs =
joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "ShaderRegister", Sampler.ShaderRegister));
+ makeRSError(formatv("Invalid value for MaxLOD: {0}",
+ Sampler.MaxLOD)));
+
+ if (!hlsl::rootsig::verifyRegisterValue(Sampler.ShaderRegister))
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for ShaderRegister: {0}",
+ Sampler.ShaderRegister)));
if (!hlsl::rootsig::verifyRegisterSpace(Sampler.RegisterSpace))
DeferredErrs =
joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "RegisterSpace", Sampler.RegisterSpace));
+ makeRSError(formatv("Invalid value for RegisterSpace: {0}",
+ Sampler.RegisterSpace)));
bool IsValidFlag =
dxbc::isValidStaticSamplerFlags(Sampler.Flags) &&
hlsl::rootsig::verifyStaticSamplerFlags(
RSD.Version, dxbc::StaticSamplerFlags(Sampler.Flags));
if (!IsValidFlag)
- DeferredErrs =
- joinErrors(std::move(DeferredErrs),
- make_error<RootSignatureValidationError<uint32_t>>(
- "Static Sampler Flag", Sampler.Flags));
+ DeferredErrs = joinErrors(
+ std::move(DeferredErrs),
+ makeRSError(formatv("Invalid value for Static Sampler Flag: {0}",
+ Sampler.Flags)));
}
return DeferredErrs;
@@ -758,9 +784,9 @@ MetadataParser::ParseRootSignature(uint32_t Version) {
for (const auto &Operand : Root->operands()) {
MDNode *Element = dyn_cast<MDNode>(Operand);
if (Element == nullptr)
- return joinErrors(std::move(DeferredErrs),
- make_error<GenericRSMetadataError>(
- "Missing Root Element Metadata Node.", nullptr));
+ return joinErrors(
+ std::move(DeferredErrs),
+ makeRSError("Missing Root Element Metadata Node."));
if (auto Err = parseRootSignatureElement(RSD, Element))
DeferredErrs = joinErrors(std::move(DeferredErrs), std::move(Err));
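
The FmtMDNode wrapper above exists because formatv has no provider for
MDNode: formatv falls back to any type that has a raw_ostream operator<<, so
a node's tree dump can be spliced directly into a message. A minimal sketch
of the resulting call shape:

    Error E = makeRSError(
        formatv("Invalid Root Signature Element\n{0}", FmtMDNode{Element}));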
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 245129f..ae086bcd 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -2369,8 +2369,12 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
AsmWriterContext &WriterCtx) {
Out << "!DICompileUnit(";
MDFieldPrinter Printer(Out, WriterCtx);
- Printer.printDwarfEnum("language", N->getSourceLanguage(),
- dwarf::LanguageString, /* ShouldSkipZero */ false);
+
+ Printer.printDwarfEnum("language",
+ N->getSourceLanguage().getUnversionedName(),
+ dwarf::LanguageString,
+ /* ShouldSkipZero */ false);
+
Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false);
Printer.printString("producer", N->getProducer());
Printer.printBool("isOptimized", N->isOptimized());
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 1344df9..1ae20a9f 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -131,17 +131,13 @@ static DIScope *getNonCompileUnitScope(DIScope *N) {
}
DICompileUnit *DIBuilder::createCompileUnit(
- unsigned Lang, DIFile *File, StringRef Producer, bool isOptimized,
- StringRef Flags, unsigned RunTimeVer, StringRef SplitName,
+ DISourceLanguageName Lang, DIFile *File, StringRef Producer,
+ bool isOptimized, StringRef Flags, unsigned RunTimeVer, StringRef SplitName,
DICompileUnit::DebugEmissionKind Kind, uint64_t DWOId,
bool SplitDebugInlining, bool DebugInfoForProfiling,
DICompileUnit::DebugNameTableKind NameTableKind, bool RangesBaseAddress,
StringRef SysRoot, StringRef SDK) {
- assert(((Lang <= dwarf::DW_LANG_Metal && Lang >= dwarf::DW_LANG_C89) ||
- (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
- "Invalid Language tag");
-
assert(!CUNode && "Can only make one compile unit per DIBuilder instance");
CUNode = DICompileUnit::getDistinct(
VMContext, Lang, File, Producer, isOptimized, Flags, RunTimeVer,
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index f9ded50..9601a8a 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -1078,7 +1078,7 @@ LLVMMetadataRef LLVMDIBuilderCreateCompileUnit(
auto File = unwrapDI<DIFile>(FileRef);
return wrap(unwrap(Builder)->createCompileUnit(
- map_from_llvmDWARFsourcelanguage(Lang), File,
+ DISourceLanguageName(map_from_llvmDWARFsourcelanguage(Lang)), File,
StringRef(Producer, ProducerLen), isOptimized, StringRef(Flags, FlagsLen),
RuntimeVer, StringRef(SplitName, SplitNameLen),
static_cast<DICompileUnit::DebugEmissionKind>(Kind), DWOId,
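
The C API continues to take a plain DWARF language code; only the wrapping
changes. A minimal sketch of the round trip, assuming the single-argument
constructor seen at this call site:

    DISourceLanguageName Lang(dwarf::DW_LANG_C_plus_plus);
    unsigned Code = Lang.getUnversionedName(); // == dwarf::DW_LANG_C_plus_plus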
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index 77d044b..e30df88 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -1184,9 +1184,10 @@ DIFile *DIFile::getImpl(LLVMContext &Context, MDString *Filename,
DEFINE_GETIMPL_STORE(DIFile, (CS, Source), Ops);
}
DICompileUnit::DICompileUnit(LLVMContext &C, StorageType Storage,
- unsigned SourceLanguage, bool IsOptimized,
- unsigned RuntimeVersion, unsigned EmissionKind,
- uint64_t DWOId, bool SplitDebugInlining,
+ DISourceLanguageName SourceLanguage,
+ bool IsOptimized, unsigned RuntimeVersion,
+ unsigned EmissionKind, uint64_t DWOId,
+ bool SplitDebugInlining,
bool DebugInfoForProfiling, unsigned NameTableKind,
bool RangesBaseAddress, ArrayRef<Metadata *> Ops)
: DIScope(C, DICompileUnitKind, Storage, dwarf::DW_TAG_compile_unit, Ops),
@@ -1199,7 +1200,7 @@ DICompileUnit::DICompileUnit(LLVMContext &C, StorageType Storage,
}
DICompileUnit *DICompileUnit::getImpl(
- LLVMContext &Context, unsigned SourceLanguage, Metadata *File,
+ LLVMContext &Context, DISourceLanguageName SourceLanguage, Metadata *File,
MDString *Producer, bool IsOptimized, MDString *Flags,
unsigned RuntimeVersion, MDString *SplitDebugFilename,
unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 20dcde8..53cf004 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1111,6 +1111,8 @@ Expected<SimplifyCFGOptions> parseSimplifyCFGOptions(StringRef Params) {
Result.forwardSwitchCondToPhi(Enable);
} else if (ParamName == "switch-range-to-icmp") {
Result.convertSwitchRangeToICmp(Enable);
+ } else if (ParamName == "switch-to-arithmetic") {
+ Result.convertSwitchToArithmetic(Enable);
} else if (ParamName == "switch-to-lookup") {
Result.convertSwitchToLookupTable(Enable);
} else if (ParamName == "keep-loops") {
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 119caea..fea0d25 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -781,6 +781,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
.convertSwitchRangeToICmp(true)
+ .convertSwitchToArithmetic(true)
.hoistCommonInsts(true)
.sinkCommonInsts(true)));
FPM.addPass(InstCombinePass());
@@ -1377,6 +1378,7 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level,
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
.forwardSwitchCondToPhi(true)
.convertSwitchRangeToICmp(true)
+ .convertSwitchToArithmetic(true)
.convertSwitchToLookupTable(true)
.needCanonicalLoops(false)
.hoistCommonInsts(true)
@@ -1603,6 +1605,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
OptimizePM.addPass(
SimplifyCFGPass(SimplifyCFGOptions()
.convertSwitchRangeToICmp(true)
+ .convertSwitchToArithmetic(true)
.speculateUnpredictables(true)
.hoistLoadsStoresWithCondFaulting(true)));
@@ -2187,6 +2190,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// Delete basic blocks, which optimization passes may have killed.
LateFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
.convertSwitchRangeToICmp(true)
+ .convertSwitchToArithmetic(true)
.hoistCommonInsts(true)
.speculateUnpredictables(true)));
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));
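
Out-of-tree pipelines opt into the new conversion the same way as the
existing switch options; a minimal sketch mirroring the hunks above:

    FunctionPassManager FPM;
    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
                                    .convertSwitchRangeToICmp(true)
                                    .convertSwitchToArithmetic(true)));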
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index c5c0d64..1b16525 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -687,8 +687,9 @@ FUNCTION_PASS_WITH_PARAMS(
parseSimplifyCFGOptions,
"no-speculate-blocks;speculate-blocks;no-simplify-cond-branch;"
"simplify-cond-branch;no-forward-switch-cond;forward-switch-cond;"
- "no-switch-range-to-icmp;switch-range-to-icmp;no-switch-to-lookup;"
- "switch-to-lookup;no-keep-loops;keep-loops;no-hoist-common-insts;"
+ "no-switch-range-to-icmp;switch-range-to-icmp;no-switch-to-arithmetic;"
+ "switch-to-arithmetic;no-switch-to-lookup;switch-to-lookup;"
+ "no-keep-loops;keep-loops;no-hoist-common-insts;"
"hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;"
"hoist-loads-stores-with-cond-faulting;no-sink-common-insts;"
"sink-common-insts;no-speculate-unpredictables;speculate-unpredictables;"
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 6ad8d7d..f184efa 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/VirtualFileSystem.h"
#include <algorithm>
#include <limits>
+#include <memory>
#include <stdio.h>
#include <string>
#include <system_error>
@@ -72,6 +73,8 @@ Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber,
void SpecialCaseList::Matcher::match(
StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
+ if (RemoveDotSlash)
+ Query = llvm::sys::path::remove_leading_dotslash(Query);
for (const auto &Glob : reverse(Globs))
if (Glob->Pattern.match(Query))
Cb(Glob->Name, Glob->LineNo);
@@ -164,12 +167,18 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
// https://discourse.llvm.org/t/use-glob-instead-of-regex-for-specialcaselists/71666
bool UseGlobs = Version > 1;
+ bool RemoveDotSlash = Version > 2;
+
Section *CurrentSection;
if (auto Err = addSection("*", FileIdx, 1).moveInto(CurrentSection)) {
Error = toString(std::move(Err));
return false;
}
+ // This is the current list of prefixes used by all existing users that
+ // match file paths. We may need to make this parametrizable in the
+ // constructor in the future.
+ constexpr StringRef PathPrefixes[] = {"src", "!src", "mainfile", "source"};
+
for (line_iterator LineIt(*MB, /*SkipBlanks=*/true, /*CommentMarker=*/'#');
!LineIt.is_at_eof(); LineIt++) {
unsigned LineNo = LineIt.line_number();
@@ -205,6 +214,8 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
auto [Pattern, Category] = Postfix.split("=");
auto &Entry = CurrentSection->Entries[Prefix][Category];
+ Entry.RemoveDotSlash =
+ RemoveDotSlash && llvm::is_contained(PathPrefixes, Prefix);
if (auto Err = Entry.insert(Pattern, LineNo, UseGlobs)) {
Error =
(Twine("malformed ") + (UseGlobs ? "glob" : "regex") + " in line " +
@@ -262,4 +273,17 @@ unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix,
return LastLine;
}
+StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix,
+ StringRef Query,
+ StringRef Category) const {
+ StringRef LongestRule;
+ if (const Matcher *M = findMatcher(Prefix, Category)) {
+ M->match(Query, [&](StringRef Rule, unsigned) {
+ if (LongestRule.size() < Rule.size())
+ LongestRule = Rule;
+ });
+ }
+ return LongestRule;
+}
+
} // namespace llvm
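
A sketch of the version-3 behavior, assuming the in-memory create entry
point and that the version directive is spelled like its v2 counterpart:

    std::string Err;
    auto MB = MemoryBuffer::getMemBuffer("#!special-case-list-v3\n"
                                         "src:lib/*.c\n");
    auto SCL = SpecialCaseList::create(MB.get(), Err);
    // With RemoveDotSlash in effect, "./lib/a.c" matches like "lib/a.c".
    bool Hit = SCL && SCL->inSection("", "src", "./lib/a.c");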
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 65b752e..9438917 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -816,8 +816,8 @@ def : BTI<"jc", 0b110>;
// TLBI (translation lookaside buffer invalidate) instruction options.
//===----------------------------------------------------------------------===//
-class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm,
- bits<3> op2, bit needsreg> {
+class TLBICommon<string name, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2, bit needsreg> {
string Name = name;
bits<14> Encoding;
let Encoding{13-11} = op1;
@@ -830,131 +830,150 @@ class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm,
code RequiresStr = [{ { }] # !interleave(Requires # ExtraRequires, [{, }]) # [{ } }];
}
-def TLBITable : GenericTable {
- let FilterClass = "TLBIEntry";
- let CppTypeName = "TLBI";
- let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"];
-
- let PrimaryKey = ["Encoding"];
- let PrimaryKeyName = "lookupTLBIByEncoding";
+class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2, bit needsreg>
+ : TLBICommon<name, op1, crn, crm, op2, needsreg>;
+
+class TLBIPEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm,
+ bits<3> op2, bit needsreg>
+ : TLBICommon<name, op1, crn, crm, op2, needsreg>;
+
+multiclass TLBITableBase {
+ def NAME # Table : GenericTable {
+ let FilterClass = NAME # "Entry";
+ let CppTypeName = NAME;
+ let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"];
+ let PrimaryKey = ["Encoding"];
+ let PrimaryKeyName = "lookup" # NAME # "ByEncoding";
+ }
+ def lookup # NAME # ByName : SearchIndex {
+ let Table = !cast<GenericTable>(NAME # "Table");
+ let Key = ["Name"];
+ }
}
-def lookupTLBIByName : SearchIndex {
- let Table = TLBITable;
- let Key = ["Name"];
-}
+defm TLBI : TLBITableBase;
+defm TLBIP : TLBITableBase;
-multiclass TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
+multiclass TLBI<string name, bit hasTLBIP, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2, bit needsreg = 1> {
def : TLBIEntry<name, op1, crn, crm, op2, needsreg>;
def : TLBIEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> {
let Encoding{7} = 1;
let ExtraRequires = ["AArch64::FeatureXS"];
}
+ if !eq(hasTLBIP, true) then {
+ def : TLBIPEntry<name, op1, crn, crm, op2, needsreg>;
+ def : TLBIPEntry<!strconcat(name, "nXS"), op1, crn, crm, op2, needsreg> {
+ let Encoding{7} = 1;
+ let ExtraRequires = ["AArch64::FeatureXS"];
+ }
+ }
}
-defm : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>;
-defm : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>;
-defm : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>;
-defm : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>;
-defm : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>;
-defm : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>;
-defm : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>;
-defm : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>;
-defm : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>;
-defm : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>;
-defm : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>;
-defm : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>;
-defm : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>;
-defm : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>;
-defm : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>;
-defm : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>;
-defm : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>;
-defm : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>;
-defm : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>;
-defm : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>;
-defm : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>;
-defm : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>;
-defm : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>;
-defm : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>;
-defm : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>;
-defm : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>;
-defm : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>;
-defm : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>;
-defm : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>;
-defm : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>;
-defm : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>;
-defm : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>;
+// hasTLBIP op1 CRn CRm op2 needsreg
+defm : TLBI<"IPAS2E1IS", 1, 0b100, 0b1000, 0b0000, 0b001>;
+defm : TLBI<"IPAS2LE1IS", 1, 0b100, 0b1000, 0b0000, 0b101>;
+defm : TLBI<"VMALLE1IS", 0, 0b000, 0b1000, 0b0011, 0b000, 0>;
+defm : TLBI<"ALLE2IS", 0, 0b100, 0b1000, 0b0011, 0b000, 0>;
+defm : TLBI<"ALLE3IS", 0, 0b110, 0b1000, 0b0011, 0b000, 0>;
+defm : TLBI<"VAE1IS", 1, 0b000, 0b1000, 0b0011, 0b001>;
+defm : TLBI<"VAE2IS", 1, 0b100, 0b1000, 0b0011, 0b001>;
+defm : TLBI<"VAE3IS", 1, 0b110, 0b1000, 0b0011, 0b001>;
+defm : TLBI<"ASIDE1IS", 0, 0b000, 0b1000, 0b0011, 0b010>;
+defm : TLBI<"VAAE1IS", 1, 0b000, 0b1000, 0b0011, 0b011>;
+defm : TLBI<"ALLE1IS", 0, 0b100, 0b1000, 0b0011, 0b100, 0>;
+defm : TLBI<"VALE1IS", 1, 0b000, 0b1000, 0b0011, 0b101>;
+defm : TLBI<"VALE2IS", 1, 0b100, 0b1000, 0b0011, 0b101>;
+defm : TLBI<"VALE3IS", 1, 0b110, 0b1000, 0b0011, 0b101>;
+defm : TLBI<"VMALLS12E1IS", 0, 0b100, 0b1000, 0b0011, 0b110, 0>;
+defm : TLBI<"VAALE1IS", 1, 0b000, 0b1000, 0b0011, 0b111>;
+defm : TLBI<"IPAS2E1", 1, 0b100, 0b1000, 0b0100, 0b001>;
+defm : TLBI<"IPAS2LE1", 1, 0b100, 0b1000, 0b0100, 0b101>;
+defm : TLBI<"VMALLE1", 0, 0b000, 0b1000, 0b0111, 0b000, 0>;
+defm : TLBI<"ALLE2", 0, 0b100, 0b1000, 0b0111, 0b000, 0>;
+defm : TLBI<"ALLE3", 0, 0b110, 0b1000, 0b0111, 0b000, 0>;
+defm : TLBI<"VAE1", 1, 0b000, 0b1000, 0b0111, 0b001>;
+defm : TLBI<"VAE2", 1, 0b100, 0b1000, 0b0111, 0b001>;
+defm : TLBI<"VAE3", 1, 0b110, 0b1000, 0b0111, 0b001>;
+defm : TLBI<"ASIDE1", 0, 0b000, 0b1000, 0b0111, 0b010>;
+defm : TLBI<"VAAE1", 1, 0b000, 0b1000, 0b0111, 0b011>;
+defm : TLBI<"ALLE1", 0, 0b100, 0b1000, 0b0111, 0b100, 0>;
+defm : TLBI<"VALE1", 1, 0b000, 0b1000, 0b0111, 0b101>;
+defm : TLBI<"VALE2", 1, 0b100, 0b1000, 0b0111, 0b101>;
+defm : TLBI<"VALE3", 1, 0b110, 0b1000, 0b0111, 0b101>;
+defm : TLBI<"VMALLS12E1", 0, 0b100, 0b1000, 0b0111, 0b110, 0>;
+defm : TLBI<"VAALE1", 1, 0b000, 0b1000, 0b0111, 0b111>;
// Armv8.4-A Translation Lookaside Buffer Instructions (TLBI)
let Requires = ["AArch64::FeatureTLB_RMI"] in {
// Armv8.4-A Outer Sharable TLB Maintenance instructions:
-// op1 CRn CRm op2
-defm : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>;
-defm : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>;
-defm : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>;
-defm : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>;
-defm : TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>;
-defm : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>;
-defm : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>;
-defm : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>;
-defm : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>;
-defm : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>;
-defm : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>;
-defm : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>;
-defm : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>;
-defm : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>;
-defm : TLBI<"ALLE1OS", 0b100, 0b1000, 0b0001, 0b100, 0>;
-defm : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>;
+// hasTLBIP op1 CRn CRm op2 needsreg
+defm : TLBI<"VMALLE1OS", 0, 0b000, 0b1000, 0b0001, 0b000, 0>;
+defm : TLBI<"VAE1OS", 1, 0b000, 0b1000, 0b0001, 0b001>;
+defm : TLBI<"ASIDE1OS", 0, 0b000, 0b1000, 0b0001, 0b010>;
+defm : TLBI<"VAAE1OS", 1, 0b000, 0b1000, 0b0001, 0b011>;
+defm : TLBI<"VALE1OS", 1, 0b000, 0b1000, 0b0001, 0b101>;
+defm : TLBI<"VAALE1OS", 1, 0b000, 0b1000, 0b0001, 0b111>;
+defm : TLBI<"IPAS2E1OS", 1, 0b100, 0b1000, 0b0100, 0b000>;
+defm : TLBI<"IPAS2LE1OS", 1, 0b100, 0b1000, 0b0100, 0b100>;
+defm : TLBI<"VAE2OS", 1, 0b100, 0b1000, 0b0001, 0b001>;
+defm : TLBI<"VALE2OS", 1, 0b100, 0b1000, 0b0001, 0b101>;
+defm : TLBI<"VMALLS12E1OS", 0, 0b100, 0b1000, 0b0001, 0b110, 0>;
+defm : TLBI<"VAE3OS", 1, 0b110, 0b1000, 0b0001, 0b001>;
+defm : TLBI<"VALE3OS", 1, 0b110, 0b1000, 0b0001, 0b101>;
+defm : TLBI<"ALLE2OS", 0, 0b100, 0b1000, 0b0001, 0b000, 0>;
+defm : TLBI<"ALLE1OS", 0, 0b100, 0b1000, 0b0001, 0b100, 0>;
+defm : TLBI<"ALLE3OS", 0, 0b110, 0b1000, 0b0001, 0b000, 0>;
// Armv8.4-A TLB Range Maintenance instructions:
-// op1 CRn CRm op2
-defm : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>;
-defm : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>;
-defm : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>;
-defm : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>;
-defm : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>;
-defm : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>;
-defm : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>;
-defm : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>;
-defm : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>;
-defm : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>;
-defm : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>;
-defm : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>;
-defm : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>;
-defm : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>;
-defm : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>;
-defm : TLBI<"RIPAS2LE1", 0b100, 0b1000, 0b0100, 0b110>;
-defm : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>;
-defm : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>;
-defm : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>;
-defm : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>;
-defm : TLBI<"RVAE2IS", 0b100, 0b1000, 0b0010, 0b001>;
-defm : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>;
-defm : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>;
-defm : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>;
-defm : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>;
-defm : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>;
-defm : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>;
-defm : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>;
-defm : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>;
-defm : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>;
+// hasTLBIP op1 CRn CRm op2 needsreg
+defm : TLBI<"RVAE1", 1, 0b000, 0b1000, 0b0110, 0b001>;
+defm : TLBI<"RVAAE1", 1, 0b000, 0b1000, 0b0110, 0b011>;
+defm : TLBI<"RVALE1", 1, 0b000, 0b1000, 0b0110, 0b101>;
+defm : TLBI<"RVAALE1", 1, 0b000, 0b1000, 0b0110, 0b111>;
+defm : TLBI<"RVAE1IS", 1, 0b000, 0b1000, 0b0010, 0b001>;
+defm : TLBI<"RVAAE1IS", 1, 0b000, 0b1000, 0b0010, 0b011>;
+defm : TLBI<"RVALE1IS", 1, 0b000, 0b1000, 0b0010, 0b101>;
+defm : TLBI<"RVAALE1IS", 1, 0b000, 0b1000, 0b0010, 0b111>;
+defm : TLBI<"RVAE1OS", 1, 0b000, 0b1000, 0b0101, 0b001>;
+defm : TLBI<"RVAAE1OS", 1, 0b000, 0b1000, 0b0101, 0b011>;
+defm : TLBI<"RVALE1OS", 1, 0b000, 0b1000, 0b0101, 0b101>;
+defm : TLBI<"RVAALE1OS", 1, 0b000, 0b1000, 0b0101, 0b111>;
+defm : TLBI<"RIPAS2E1IS", 1, 0b100, 0b1000, 0b0000, 0b010>;
+defm : TLBI<"RIPAS2LE1IS", 1, 0b100, 0b1000, 0b0000, 0b110>;
+defm : TLBI<"RIPAS2E1", 1, 0b100, 0b1000, 0b0100, 0b010>;
+defm : TLBI<"RIPAS2LE1", 1, 0b100, 0b1000, 0b0100, 0b110>;
+defm : TLBI<"RIPAS2E1OS", 1, 0b100, 0b1000, 0b0100, 0b011>;
+defm : TLBI<"RIPAS2LE1OS", 1, 0b100, 0b1000, 0b0100, 0b111>;
+defm : TLBI<"RVAE2", 1, 0b100, 0b1000, 0b0110, 0b001>;
+defm : TLBI<"RVALE2", 1, 0b100, 0b1000, 0b0110, 0b101>;
+defm : TLBI<"RVAE2IS", 1, 0b100, 0b1000, 0b0010, 0b001>;
+defm : TLBI<"RVALE2IS", 1, 0b100, 0b1000, 0b0010, 0b101>;
+defm : TLBI<"RVAE2OS", 1, 0b100, 0b1000, 0b0101, 0b001>;
+defm : TLBI<"RVALE2OS", 1, 0b100, 0b1000, 0b0101, 0b101>;
+defm : TLBI<"RVAE3", 1, 0b110, 0b1000, 0b0110, 0b001>;
+defm : TLBI<"RVALE3", 1, 0b110, 0b1000, 0b0110, 0b101>;
+defm : TLBI<"RVAE3IS", 1, 0b110, 0b1000, 0b0010, 0b001>;
+defm : TLBI<"RVALE3IS", 1, 0b110, 0b1000, 0b0010, 0b101>;
+defm : TLBI<"RVAE3OS", 1, 0b110, 0b1000, 0b0101, 0b001>;
+defm : TLBI<"RVALE3OS", 1, 0b110, 0b1000, 0b0101, 0b101>;
} //FeatureTLB_RMI
// Armv9-A Realm Management Extension TLBI Instructions
let Requires = ["AArch64::FeatureRME"] in {
-defm : TLBI<"RPAOS", 0b110, 0b1000, 0b0100, 0b011>;
-defm : TLBI<"RPALOS", 0b110, 0b1000, 0b0100, 0b111>;
-defm : TLBI<"PAALLOS", 0b110, 0b1000, 0b0001, 0b100, 0>;
-defm : TLBI<"PAALL", 0b110, 0b1000, 0b0111, 0b100, 0>;
+defm : TLBI<"RPAOS", 0, 0b110, 0b1000, 0b0100, 0b011>;
+defm : TLBI<"RPALOS", 0, 0b110, 0b1000, 0b0100, 0b111>;
+defm : TLBI<"PAALLOS", 0, 0b110, 0b1000, 0b0001, 0b100, 0>;
+defm : TLBI<"PAALL", 0, 0b110, 0b1000, 0b0111, 0b100, 0>;
}
// Armv9.5-A TLBI VMALL for Dirty State
let Requires = ["AArch64::FeatureTLBIW"] in {
-// op1, CRn, CRm, op2, needsreg
-defm : TLBI<"VMALLWS2E1", 0b100, 0b1000, 0b0110, 0b010, 0>;
-defm : TLBI<"VMALLWS2E1IS", 0b100, 0b1000, 0b0010, 0b010, 0>;
-defm : TLBI<"VMALLWS2E1OS", 0b100, 0b1000, 0b0101, 0b010, 0>;
+// hasTLBIP op1, CRn, CRm, op2, needsreg
+defm : TLBI<"VMALLWS2E1", 0, 0b100, 0b1000, 0b0110, 0b010, 0>;
+defm : TLBI<"VMALLWS2E1IS", 0, 0b100, 0b1000, 0b0010, 0b010, 0>;
+defm : TLBI<"VMALLWS2E1OS", 0, 0b100, 0b1000, 0b0101, 0b010, 0>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 3641e22..2c3870c 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -4020,23 +4020,23 @@ bool AArch64AsmParser::parseSyspAlias(StringRef Name, SMLoc NameLoc,
if (HasnXSQualifier) {
Op = Op.drop_back(3);
}
- const AArch64TLBI::TLBI *TLBIorig = AArch64TLBI::lookupTLBIByName(Op);
- if (!TLBIorig)
+ const AArch64TLBIP::TLBIP *TLBIPorig = AArch64TLBIP::lookupTLBIPByName(Op);
+ if (!TLBIPorig)
return TokError("invalid operand for TLBIP instruction");
- const AArch64TLBI::TLBI TLBI(
- TLBIorig->Name, TLBIorig->Encoding | (HasnXSQualifier ? (1 << 7) : 0),
- TLBIorig->NeedsReg,
+ const AArch64TLBIP::TLBIP TLBIP(
+ TLBIPorig->Name, TLBIPorig->Encoding | (HasnXSQualifier ? (1 << 7) : 0),
+ TLBIPorig->NeedsReg,
HasnXSQualifier
- ? TLBIorig->FeaturesRequired | FeatureBitset({AArch64::FeatureXS})
- : TLBIorig->FeaturesRequired);
- if (!TLBI.haveFeatures(getSTI().getFeatureBits())) {
+ ? TLBIPorig->FeaturesRequired | FeatureBitset({AArch64::FeatureXS})
+ : TLBIPorig->FeaturesRequired);
+ if (!TLBIP.haveFeatures(getSTI().getFeatureBits())) {
std::string Name =
- std::string(TLBI.Name) + (HasnXSQualifier ? "nXS" : "");
+ std::string(TLBIP.Name) + (HasnXSQualifier ? "nXS" : "");
std::string Str("TLBIP " + Name + " requires: ");
- setRequiredFeatureString(TLBI.getRequiredFeatures(), Str);
+ setRequiredFeatureString(TLBIP.getRequiredFeatures(), Str);
return TokError(Str);
}
- createSysAlias(TLBI.Encoding, Operands, S);
+ createSysAlias(TLBIP.Encoding, Operands, S);
}
Lex(); // Eat operand.
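
The parser now goes through the accessors that the TLBITableBase multiclass
generates for the TLBIP flavor; a sketch of that generated API in use:

    // Both names are assembled inside TLBITableBase above.
    if (const AArch64TLBIP::TLBIP *E =
            AArch64TLBIP::lookupTLBIPByName("RVAE1IS"))
      (void)AArch64TLBIP::lookupTLBIPByEncoding(E->Encoding); // same entry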
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 2552ee3..35bd244 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -1066,12 +1066,13 @@ bool AArch64InstPrinter::printSyspAlias(const MCInst *MI,
Encoding &= ~(1 << 7);
}
- const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding);
- if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits()))
+ const AArch64TLBIP::TLBIP *TLBIP =
+ AArch64TLBIP::lookupTLBIPByEncoding(Encoding);
+ if (!TLBIP || !TLBIP->haveFeatures(STI.getFeatureBits()))
return false;
Ins = "tlbip\t";
- Name = std::string(TLBI->Name);
+ Name = std::string(TLBIP->Name);
if (CnVal == 9)
Name += "nXS";
} else
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 7767028..d6cb0e8 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -186,6 +186,13 @@ namespace llvm {
}
namespace llvm {
+namespace AArch64TLBIP {
+#define GET_TLBIPTable_IMPL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64TLBIP
+} // namespace llvm
+
+namespace llvm {
namespace AArch64SVCR {
#define GET_SVCRsList_IMPL
#include "AArch64GenSystemOperands.inc"
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index a4ee963..fea33ef 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -795,6 +795,14 @@ namespace AArch64TLBI {
#include "AArch64GenSystemOperands.inc"
}
+namespace AArch64TLBIP {
+struct TLBIP : SysAliasReg {
+ using SysAliasReg::SysAliasReg;
+};
+#define GET_TLBIPTable_DECL
+#include "AArch64GenSystemOperands.inc"
+} // namespace AArch64TLBIP
+
namespace AArch64II {
/// Target Operand Flag enum.
enum TOF {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index ef58004..9907c88f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1288,16 +1288,17 @@ static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
return std::min(MaxVirtReg + MaxPhysReg, 256u);
}
-// TODO: Migrate to range merge of amdgpu-agpr-alloc.
-struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
- using Base = StateWrapper<BooleanState, AbstractAttribute>;
- AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+struct AAAMDGPUMinAGPRAlloc
+ : public StateWrapper<DecIntegerState<>, AbstractAttribute> {
+ using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
+ AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
- static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
- Attributor &A) {
+ static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
+ Attributor &A) {
if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
- return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
- llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
+ return *new (A.Allocator) AAAMDGPUMinAGPRAlloc(IRP, A);
+ llvm_unreachable(
+ "AAAMDGPUMinAGPRAlloc is only valid for function position");
}
void initialize(Attributor &A) override {
@@ -1310,25 +1311,33 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
}
const std::string getAsStr(Attributor *A) const override {
- return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
+ std::string Str = "amdgpu-agpr-alloc=";
+ raw_string_ostream OS(Str);
+ OS << getAssumed();
+ return OS.str();
}
void trackStatistics() const override {}
ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
+ DecIntegerState<> Maximum;
- auto CheckForNoAGPRs = [&](Instruction &I) {
+ // Check for cases which require allocation of AGPRs. AGPRs are required
+ // only where they are referenced directly, i.e. in inline assembly and in
+ // certain special intrinsics.
+ auto CheckForMinAGPRAllocs = [&](Instruction &I) {
const auto &CB = cast<CallBase>(I);
const Value *CalleeOp = CB.getCalledOperand();
- const Function *Callee = dyn_cast<Function>(CalleeOp);
- if (!Callee) {
- if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
- return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
- return false;
+
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp)) {
+ // Technically, the inline asm could invoke a call to an unknown external
+ // function that requires AGPRs, but we ignore that possibility here.
+ unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
+ Maximum.takeAssumedMaximum(NumRegs);
+ return true;
}
- switch (Callee->getIntrinsicID()) {
+ switch (CB.getIntrinsicID()) {
case Intrinsic::not_intrinsic:
break;
case Intrinsic::write_register:
@@ -1340,7 +1349,10 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
->getOperand(0));
auto [Kind, RegIdx, NumRegs] =
AMDGPU::parseAsmPhysRegName(RegName->getString());
- return Kind != 'a';
+ if (Kind == 'a')
+ Maximum.takeAssumedMaximum(std::min(RegIdx + NumRegs, 256u));
+
+ return true;
}
default:
// Some intrinsics may use AGPRs, but if we have a choice, we are not
@@ -1349,32 +1361,50 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
}
// TODO: Handle callsite attributes
- const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
- *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
- return CalleeInfo && CalleeInfo->isValidState() &&
- CalleeInfo->getAssumed();
+ auto *CBEdges = A.getAAFor<AACallEdges>(
+ *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
+ if (!CBEdges || CBEdges->hasUnknownCallee()) {
+ Maximum.indicatePessimisticFixpoint();
+ return false;
+ }
+
+ for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
+ const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
+ *this, IRPosition::function(*PossibleCallee), DepClassTy::REQUIRED);
+ if (!CalleeInfo || !CalleeInfo->isValidState()) {
+ Maximum.indicatePessimisticFixpoint();
+ return false;
+ }
+
+ Maximum.takeAssumedMaximum(CalleeInfo->getAssumed());
+ }
+
+ return true;
};
bool UsedAssumedInformation = false;
- if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
+ if (!A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *this,
UsedAssumedInformation))
return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
+
+ return clampStateAndIndicateChange(getState(), Maximum);
}
ChangeStatus manifest(Attributor &A) override {
- if (!getAssumed())
- return ChangeStatus::UNCHANGED;
LLVMContext &Ctx = getAssociatedFunction()->getContext();
- return A.manifestAttrs(getIRPosition(),
- {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
+ SmallString<4> Buffer;
+ raw_svector_ostream OS(Buffer);
+ OS << getAssumed();
+
+ return A.manifestAttrs(
+ getIRPosition(), {Attribute::get(Ctx, "amdgpu-agpr-alloc", OS.str())});
}
- StringRef getName() const override { return "AAAMDGPUNoAGPR"; }
+ StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }
const char *getIdAddr() const override { return &ID; }
/// This function should return true if the type of the \p AA is
- /// AAAMDGPUNoAGPRs
+ /// AAAMDGPUMinAGPRAlloc.
static bool classof(const AbstractAttribute *AA) {
return (AA->getIdAddr() == &ID);
}
@@ -1382,7 +1412,7 @@ struct AAAMDGPUNoAGPR : public StateWrapper<BooleanState, AbstractAttribute> {
static const char ID;
};
-const char AAAMDGPUNoAGPR::ID = 0;
+const char AAAMDGPUMinAGPRAlloc::ID = 0;
/// An abstract attribute to propagate the function attribute
/// "amdgpu-cluster-dims" from kernel entry functions to device functions.
@@ -1550,10 +1580,11 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
- &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
- &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
- &AAUnderlyingObjects::ID, &AANoAliasAddrSpace::ID, &AAAddressSpace::ID,
- &AAIndirectCallInfo::ID, &AAAMDGPUClusterDims::ID});
+ &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
+ &AAAMDGPUMinAGPRAlloc::ID, &AACallEdges::ID, &AAPointerInfo::ID,
+ &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID,
+ &AANoAliasAddrSpace::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
+ &AAAMDGPUClusterDims::ID});
AttributorConfig AC(CGUpdater);
AC.IsClosedWorldModule = Options.IsClosedWorld;
@@ -1595,7 +1626,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));
if (ST.hasGFX90AInsts())
- A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(*F));
+ A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRPosition::function(*F));
for (auto &I : instructions(F)) {
Value *Ptr = nullptr;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index fedb694..89c16da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -482,12 +482,13 @@ void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
}
sort(StackIntervals, [](const LiveInterval *A, const LiveInterval *B) {
+ // The comparator must define a strict weak ordering.
/// Sort heaviest intervals first to prioritize their unspilling
- if (A->weight() > B->weight())
- return true;
+ if (A->weight() != B->weight())
+ return A->weight() > B->weight();
- if (A->getSize() > B->getSize())
- return true;
+ if (A->getSize() != B->getSize())
+ return A->getSize() > B->getSize();
// Tie breaker by number to avoid need for stable sort
return A->reg().stackSlotIndex() < B->reg().stackSlotIndex();
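
The old comparator could return true for both cmp(A, B) and cmp(B, A), e.g.
when A is heavier but B is larger, violating the strict weak ordering that
sorting requires. A minimal reproduction of the broken shape:

    struct P { int W, S, Id; };
    // With A = {2, 1, 0} and B = {1, 2, 1}, cmp(A, B) and cmp(B, A) are
    // both true, so sorting these has undefined behavior.
    bool cmp(const P &A, const P &B) {
      if (A.W > B.W) return true;
      if (A.S > B.S) return true;
      return A.Id < B.Id;
    }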
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index bc1a3a7..82c43ff 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -1507,7 +1507,7 @@ void DXILBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
SmallVectorImpl<uint64_t> &Record,
unsigned Abbrev) {
Record.push_back(N->isDistinct());
- Record.push_back(N->getSourceLanguage());
+ Record.push_back(N->getSourceLanguage().getUnversionedName());
Record.push_back(VE.getMetadataOrNullID(N->getFile()));
Record.push_back(VE.getMetadataOrNullID(N->getRawProducer()));
Record.push_back(N->isOptimized());
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 8d9b777..a29b7dd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -788,32 +788,32 @@ multiclass ShxAdd_UWPat<int i, Instruction shxadd_uw> {
}
multiclass Sh1Add_UWPat<Instruction sh1add_uw> {
- def : Pat<(i64 (add_like_non_imm12 (and (shl GPR:$rs1, (i64 1)), 0x1FFFFFFFF),
- (XLenVT GPR:$rs2))),
+ def : Pat<(add_like_non_imm12 (and (shl GPR:$rs1, (i64 1)), (i64 0x1FFFFFFFF)),
+ (XLenVT GPR:$rs2)),
(sh1add_uw GPR:$rs1, GPR:$rs2)>;
// Use SRLI to clear the LSBs and SHXADD_UW to mask and shift.
- def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0x1FFFFFFFE),
- (XLenVT GPR:$rs2))),
+ def : Pat<(add_like_non_imm12 (and GPR:$rs1, (i64 0x1FFFFFFFE)),
+ (XLenVT GPR:$rs2)),
(sh1add_uw (XLenVT (SRLI GPR:$rs1, 1)), GPR:$rs2)>;
}
multiclass Sh2Add_UWPat<Instruction sh2add_uw> {
- def : Pat<(i64 (add_like_non_imm12 (and (shl GPR:$rs1, (i64 2)), 0x3FFFFFFFF),
- (XLenVT GPR:$rs2))),
+ def : Pat<(add_like_non_imm12 (and (shl GPR:$rs1, (i64 2)), (i64 0x3FFFFFFFF)),
+ (XLenVT GPR:$rs2)),
(sh2add_uw GPR:$rs1, GPR:$rs2)>;
// Use SRLI to clear the LSBs and SHXADD_UW to mask and shift.
- def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0x3FFFFFFFC),
- (XLenVT GPR:$rs2))),
+ def : Pat<(add_like_non_imm12 (and GPR:$rs1, (i64 0x3FFFFFFFC)),
+ (XLenVT GPR:$rs2)),
(sh2add_uw (XLenVT (SRLI GPR:$rs1, 2)), GPR:$rs2)>;
}
multiclass Sh3Add_UWPat<Instruction sh3add_uw> {
- def : Pat<(i64 (add_like_non_imm12 (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF),
- (XLenVT GPR:$rs2))),
+ def : Pat<(add_like_non_imm12 (and (shl GPR:$rs1, (i64 3)), (i64 0x7FFFFFFFF)),
+ (XLenVT GPR:$rs2)),
(sh3add_uw GPR:$rs1, GPR:$rs2)>;
// Use SRLI to clear the LSBs and SHXADD_UW to mask and shift.
- def : Pat<(i64 (add_like_non_imm12 (and GPR:$rs1, 0x7FFFFFFF8),
- (XLenVT GPR:$rs2))),
+ def : Pat<(add_like_non_imm12 (and GPR:$rs1, (i64 0x7FFFFFFF8)),
+ (XLenVT GPR:$rs2)),
(sh3add_uw (XLenVT (SRLI GPR:$rs1, 3)), GPR:$rs2)>;
}
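
The masks in these patterns encode zero-extension of the low 32 bits before
the shift: (x << 1) & 0x1FFFFFFFF equals (zext32(x) << 1), which is exactly
what sh1add.uw computes. A scalar sketch of the matched arithmetic:

    // sh1add.uw rd, rs1, rs2  =>  rd = (zext32(rs1) << 1) + rs2
    uint64_t sh1add_uw(uint64_t Rs1, uint64_t Rs2) {
      return ((Rs1 & 0xFFFFFFFFull) << 1) + Rs2;
    }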
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 82e768d..6605a5c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -238,7 +238,7 @@ class RISCVRegisterClass<list<ValueType> regTypes, int align, dag regList>
}
class GPRRegisterClass<dag regList>
- : RISCVRegisterClass<[XLenVT, XLenFVT, i32, i16], 32, regList> {
+ : RISCVRegisterClass<[XLenVT, XLenFVT], 32, regList> {
let RegInfos = XLenRI;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
index 275463e..318ef06 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp
@@ -112,7 +112,8 @@ bool SPIRVEmitNonSemanticDI::emitGlobalDI(MachineFunction &MF) {
FilePaths.emplace_back();
sys::path::append(FilePaths.back(), File->getDirectory(),
File->getFilename());
- LLVMSourceLanguages.push_back(CompileUnit->getSourceLanguage());
+ LLVMSourceLanguages.push_back(
+ CompileUnit->getSourceLanguage().getUnversionedName());
}
}
const NamedMDNode *ModuleFlags = M->getNamedMetadata("llvm.module.flags");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 6bb064a..526420b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -441,7 +441,9 @@ void WebAssemblyAsmPrinter::EmitProducerInfo(Module &M) {
llvm::SmallSet<StringRef, 4> SeenLanguages;
for (size_t I = 0, E = Debug->getNumOperands(); I < E; ++I) {
const auto *CU = cast<DICompileUnit>(Debug->getOperand(I));
- StringRef Language = dwarf::LanguageString(CU->getSourceLanguage());
+ StringRef Language =
+ dwarf::LanguageString(CU->getSourceLanguage().getUnversionedName());
+
Language.consume_front("DW_LANG_");
if (SeenLanguages.insert(Language).second)
Languages.emplace_back(Language.str(), "");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 1306026..49af78b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1445,6 +1445,49 @@ def : Pat<(v16i8 (wasm_narrow_u (v8i16 V128:$left), (v8i16 V128:$right))),
def : Pat<(v8i16 (wasm_narrow_u (v4i32 V128:$left), (v4i32 V128:$right))),
(NARROW_U_I16x8 $left, $right)>;
+// Recognize a saturating truncation and convert it into the corresponding
+// narrow_TYPE_s or narrow_TYPE_u instruction.
+multiclass SignedSaturatingTruncate<ValueType input, ValueType output,
+ Instruction narrow, int minval,
+ int maxval, int mask> {
+ def : Pat<
+ (output (wasm_narrow_u
+ (and (smin (smax (input V128:$a), (splat_vector (i32 minval))),
+ (splat_vector (i32 maxval))), (splat_vector (i32 mask))),
+ (and (smin (smax (input V128:$b), (splat_vector (i32 minval))),
+ (splat_vector (i32 maxval))), (splat_vector (i32 mask)))
+ )),
+ (narrow V128:$a, V128:$b)
+ >;
+
+ def : Pat<
+ (output (wasm_narrow_u
+ (and (smax (smin (input V128:$a), (splat_vector (i32 maxval))),
+ (splat_vector (i32 minval))), (splat_vector (i32 mask))),
+ (and (smax (smin (input V128:$b), (splat_vector (i32 maxval))),
+ (splat_vector (i32 minval))), (splat_vector (i32 mask)))
+ )),
+ (narrow V128:$a, V128:$b)
+ >;
+}
+
+defm : SignedSaturatingTruncate<v8i16, v16i8, NARROW_S_I8x16, -128, 127, 0xFF>;
+defm : SignedSaturatingTruncate<v4i32, v8i16, NARROW_S_I16x8, -32768, 32767, 0xFFFF>;
+
+multiclass UnsignedSaturatingTruncate<ValueType input, ValueType output,
+ Instruction narrow, int maxval> {
+ def : Pat<
+ (output (wasm_narrow_u
+ (umin (input V128:$a), (splat_vector (i32 maxval))),
+ (umin (input V128:$b), (splat_vector (i32 maxval)))
+ )),
+ (narrow V128:$a, V128:$b)
+ >;
+}
+
+defm : UnsignedSaturatingTruncate<v8i16, v16i8, NARROW_U_I8x16, 0xFF>;
+defm : UnsignedSaturatingTruncate<v4i32, v8i16, NARROW_U_I16x8, 0xFFFF>;
+
// Bitcasts are nops
// Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
foreach t1 = AllVecs in
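
The two Pat alternatives in SignedSaturatingTruncate cover both operand orders of the clamp: since minval < maxval, applying smax-then-smin or smin-then-smax yields the same clamped value, and the trailing and-mask keeps only the bits the unsigned narrow would read. A scalar C++ model of one i16 -> i8 lane, exhaustively checking that the two orders agree (an illustration of the matched dag, assuming narrow_s's per-lane semantics equal this clamp; it is not the instruction selector itself):

#include <cassert>
#include <cstdint>

// One lane of the signed i16x8 -> i8x16 case:
// clamp to [-128, 127], then mask to the low 8 bits narrowing keeps.
static uint8_t clampThenMask(int16_t X) {
  int16_t Lo = X < -128 ? int16_t(-128) : X;   // smax(X, minval)
  int16_t Hi = Lo > 127 ? int16_t(127) : Lo;   // smin(.., maxval)
  return uint8_t(Hi & 0xFF);                   // and with mask
}

// The same clamp with smin applied first, as matched by the second Pat.
static uint8_t maskOtherOrder(int16_t X) {
  int16_t Hi = X > 127 ? int16_t(127) : X;     // smin(X, maxval)
  int16_t Lo = Hi < -128 ? int16_t(-128) : Hi; // smax(.., minval)
  return uint8_t(Lo & 0xFF);
}

int main() {
  for (int V = INT16_MIN; V <= INT16_MAX; ++V)
    assert(clampThenMask(int16_t(V)) == maskOtherOrder(int16_t(V)));
}
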
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 0accb22..c89af68 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -689,10 +689,14 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
DISubprogram *DIS = F.getSubprogram();
// If there is no DISubprogram for F, it implies the function is compiled
// without debug info. So we also don't generate debug info for the frame.
- if (!DIS || !DIS->getUnit() ||
- !dwarf::isCPlusPlus(
- (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()) ||
- DIS->getUnit()->getEmissionKind() != DICompileUnit::DebugEmissionKind::FullDebug)
+
+ if (!DIS || !DIS->getUnit())
+ return;
+
+ if (!dwarf::isCPlusPlus(static_cast<llvm::dwarf::SourceLanguage>(
+ DIS->getUnit()->getSourceLanguage().getUnversionedName())) ||
+ DIS->getUnit()->getEmissionKind() !=
+ DICompileUnit::DebugEmissionKind::FullDebug)
return;
assert(Shape.ABI == coro::ABI::Switch &&
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index aa030294..127a506 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -60,6 +60,58 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
return true;
}
+/// Let N = 2 * M.
+/// Given an N-bit integer representing a pack of two M-bit integers,
+/// we can select one of the packed integers by right-shifting by either
+/// zero or M (which is simplest to check when M is a power
+/// of 2), and then isolating the lower M bits. In this case, we can
+/// represent the shift as a select on whether the shr amount is nonzero.
+static Value *simplifyShiftSelectingPackedElement(Instruction *I,
+ const APInt &DemandedMask,
+ InstCombinerImpl &IC,
+ unsigned Depth) {
+ assert(I->getOpcode() == Instruction::LShr &&
+ "Only lshr instruction supported");
+
+ uint64_t ShlAmt;
+ Value *Upper, *Lower;
+ if (!match(I->getOperand(0),
+ m_OneUse(m_c_DisjointOr(
+ m_OneUse(m_Shl(m_Value(Upper), m_ConstantInt(ShlAmt))),
+ m_Value(Lower)))))
+ return nullptr;
+
+ if (!isPowerOf2_64(ShlAmt))
+ return nullptr;
+
+ const uint64_t DemandedBitWidth = DemandedMask.getActiveBits();
+ if (DemandedBitWidth > ShlAmt)
+ return nullptr;
+
+ // Check that the upper demanded bits are not lost by the left shift.
+ if (Upper->getType()->getScalarSizeInBits() < ShlAmt + DemandedBitWidth)
+ return nullptr;
+
+ KnownBits KnownLowerBits = IC.computeKnownBits(Lower, I, Depth);
+ if (!KnownLowerBits.getMaxValue().isIntN(ShlAmt))
+ return nullptr;
+
+ Value *ShrAmt = I->getOperand(1);
+ KnownBits KnownShrBits = IC.computeKnownBits(ShrAmt, I, Depth);
+
+ // Verify that ShrAmt is either exactly ShlAmt (which is a power of 2) or
+ // zero.
+ if (~KnownShrBits.Zero != ShlAmt)
+ return nullptr;
+
+ Value *ShrAmtZ =
+ IC.Builder.CreateICmpEQ(ShrAmt, Constant::getNullValue(ShrAmt->getType()),
+ ShrAmt->getName() + ".z");
+ Value *Select = IC.Builder.CreateSelect(ShrAmtZ, Lower, Upper);
+ Select->takeName(I);
+ return Select;
+}
+
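
Concretely, for M = 32 the helper rewrites a 64-bit lshr of a pack built by a disjoint or into a select. A C++ model of the before/after, under the helper's preconditions (Lower known to fit in 32 bits, the shift amount known to be 0 or 32, only the low 32 bits demanded); this is a sketch of the rewrite, not the InstCombine code:

#include <cassert>
#include <cstdint>

// Before: shift the packed value and keep the demanded low half.
static uint64_t viaShift(uint64_t Upper, uint64_t Lower, unsigned ShrAmt) {
  uint64_t Packed = (Upper << 32) | Lower; // disjoint or: halves don't overlap
  return (Packed >> ShrAmt) & 0xFFFFFFFFull;
}

// After: the select the transform emits, keyed on "shramt == 0".
static uint64_t viaSelect(uint64_t Upper, uint64_t Lower, unsigned ShrAmt) {
  return (ShrAmt == 0 ? Lower : Upper) & 0xFFFFFFFFull;
}

int main() {
  uint64_t Up = 0x12345678, Lo = 0x9ABCDEF0; // both fit in 32 bits
  unsigned Shifts[] = {0, 32};
  for (unsigned S : Shifts)
    assert(viaShift(Up, Lo, S) == viaSelect(Up, Lo, S));
  return 0;
}
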
/// Returns the bitwidth of the given scalar or pointer type. For vector types,
/// returns the element type's bitwidth.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
@@ -798,9 +850,13 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
Known >>= ShiftAmt;
if (ShiftAmt)
Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
- } else {
- llvm::computeKnownBits(I, Known, Q, Depth);
+ break;
}
+ if (Value *V =
+ simplifyShiftSelectingPackedElement(I, DemandedMask, *this, Depth))
+ return V;
+
+ llvm::computeKnownBits(I, Known, Q, Depth);
break;
}
case Instruction::AShr: {
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index b9b5b58..638952a 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -699,6 +699,7 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::PtrToInt:
+ case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::AddrSpaceCast:
case Instruction::BitCast:
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index d6b7633..3c1a8ba 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -2066,6 +2066,7 @@ NewGVN::performSymbolicEvaluation(Instruction *I,
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::PtrToInt:
+ case Instruction::PtrToAddr:
case Instruction::IntToPtr:
case Instruction::Select:
case Instruction::ExtractElement:
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 60e5df0..7ffccf7 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -355,6 +355,8 @@ void SimplifyCFGPass::printPipeline(
OS << (Options.ForwardSwitchCondToPhi ? "" : "no-") << "forward-switch-cond;";
OS << (Options.ConvertSwitchRangeToICmp ? "" : "no-")
<< "switch-range-to-icmp;";
+ OS << (Options.ConvertSwitchToArithmetic ? "" : "no-")
+ << "switch-to-arithmetic;";
OS << (Options.ConvertSwitchToLookupTable ? "" : "no-")
<< "switch-to-lookup;";
OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;";
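
With this addition the textual pipeline form of the pass round-trips the new option alongside the existing ones; judging from the prints above, the emitted string should contain a fragment like simplifycfg<no-forward-switch-cond;switch-range-to-icmp;switch-to-arithmetic;no-switch-to-lookup;keep-loops;...> (reconstructed from this hunk only; options outside it are not shown here).
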
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 5a09b73..2923633 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -19,6 +19,7 @@
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
@@ -162,8 +163,8 @@ bool llvm::applyDebugifyMetadata(
unsigned NextLine = 1;
unsigned NextVar = 1;
auto File = DIB.createFile(M.getName(), "/");
- auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify",
- /*isOptimized=*/true, "", 0);
+ auto CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C), File,
+ "debugify", /*isOptimized=*/true, "", 0);
// Visit each instruction.
for (Function &F : Functions) {
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 7cc9ff8..0c8d6fa 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -45,12 +45,6 @@ STATISTIC(NumInstrsHoisted,
"Number of instructions hoisted into loop preheader");
STATISTIC(NumInstrsDuplicated,
"Number of instructions cloned into loop preheader");
-STATISTIC(NumRotated, "Number of loops rotated");
-
-static cl::opt<bool>
- MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,
- cl::desc("Allow loop rotation multiple times in order to reach "
- "a better latch exit"));
// Probability that a rotated loop has zero trip count / is never entered.
static constexpr uint32_t ZeroTripCountWeights[] = {1, 127};
@@ -206,50 +200,6 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) {
return false;
}
-// Check that latch exit is deoptimizing (which means - very unlikely to happen)
-// and there is another exit from the loop which is non-deoptimizing.
-// If we rotate latch to that exit our loop has a better chance of being fully
-// canonical.
-//
-// It can give false positives in some rare cases.
-static bool canRotateDeoptimizingLatchExit(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- assert(Latch && "need latch");
- BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
- // Need normal exiting latch.
- if (!BI || !BI->isConditional())
- return false;
-
- BasicBlock *Exit = BI->getSuccessor(1);
- if (L->contains(Exit))
- Exit = BI->getSuccessor(0);
-
- // Latch exit is non-deoptimizing, no need to rotate.
- if (!Exit->getPostdominatingDeoptimizeCall())
- return false;
-
- SmallVector<BasicBlock *, 4> Exits;
- L->getUniqueExitBlocks(Exits);
- if (!Exits.empty()) {
- // There is at least one non-deoptimizing exit.
- //
- // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
- // as it can conservatively return false for deoptimizing exits with
- // complex enough control flow down to deoptimize call.
- //
- // That means here we can report success for a case where
- // all exits are deoptimizing but one of them has complex enough
- // control flow (e.g. with loops).
- //
- // That should be a very rare case and false positives for this function
- // have compile-time effect only.
- return any_of(Exits, [](const BasicBlock *BB) {
- return !BB->getPostdominatingDeoptimizeCall();
- });
- }
- return false;
-}
-
static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
bool HasConditionalPreHeader,
bool SuccsSwapped) {
@@ -387,506 +337,489 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
/// rotation. LoopRotate should be repeatable and converge to a canonical
/// form. This property is satisfied because simplifying the loop latch can only
/// happen once across multiple invocations of the LoopRotate pass.
-///
-/// If -loop-rotate-multi is enabled we can do multiple rotations in one go
-/// so to reach a suitable (non-deoptimizing) exit.
bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// If the loop has only one block then there is not much to rotate.
if (L->getBlocks().size() == 1)
return false;
bool Rotated = false;
- do {
- BasicBlock *OrigHeader = L->getHeader();
- BasicBlock *OrigLatch = L->getLoopLatch();
-
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
- return Rotated;
-
- // If the loop header is not one of the loop exiting blocks then
- // either this loop is already rotated or it is not
- // suitable for loop rotation transformations.
- if (!L->isLoopExiting(OrigHeader))
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return Rotated;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return Rotated;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return Rotated;
+
+ // Rotate if the loop latch was just simplified, if rotating makes the
+ // loop exit count computable, or if we think it will be profitable.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
+ !profitableToRotateLoopExitingLatch(L))
+ return Rotated;
+
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO);
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(
+ dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
return Rotated;
-
- // If the loop latch already contains a branch that leaves the loop then the
- // loop is already rotated.
- if (!OrigLatch)
+ }
+ if (Metrics.Convergence != ConvergenceKind::None) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
return Rotated;
-
- // Rotate if either the loop latch does *not* exit the loop, or if the loop
- // latch was just simplified. Or if we think it will be profitable.
- if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
- !profitableToRotateLoopExitingLatch(L) &&
- !canRotateDeoptimizingLatchExit(L))
+ }
+ if (!Metrics.NumInsts.isValid()) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions"
+ " with invalid cost: ";
+ L->dump());
return Rotated;
-
- // Check size of original header and reject loop if it is very big or we can't
- // duplicate blocks inside it.
- {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- CodeMetrics Metrics;
- Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO);
- if (Metrics.notDuplicatable) {
- LLVM_DEBUG(
- dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: ";
- L->dump());
- return Rotated;
- }
- if (Metrics.Convergence != ConvergenceKind::None) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
- "instructions: ";
- L->dump());
- return Rotated;
- }
- if (!Metrics.NumInsts.isValid()) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions"
- " with invalid cost: ";
- L->dump());
- return Rotated;
- }
- if (Metrics.NumInsts > MaxHeaderSize) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
- << Metrics.NumInsts
- << " instructions, which is more than the threshold ("
- << MaxHeaderSize << " instructions): ";
- L->dump());
- ++NumNotRotatedDueToHeaderSize;
- return Rotated;
- }
-
- // When preparing for LTO, avoid rotating loops with calls that could be
- // inlined during the LTO stage.
- if (PrepareForLTO && Metrics.NumInlineCandidates > 0)
- return Rotated;
}
-
- // Now, this loop is suitable for rotation.
- BasicBlock *OrigPreheader = L->getLoopPreheader();
-
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!OrigPreheader || !L->hasDedicatedExits())
+ if (Metrics.NumInsts > MaxHeaderSize) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
+ << Metrics.NumInsts
+ << " instructions, which is more than the threshold ("
+ << MaxHeaderSize << " instructions): ";
+ L->dump());
+ ++NumNotRotatedDueToHeaderSize;
return Rotated;
-
- // Anything ScalarEvolution may know about this loop or the PHI nodes
- // in its header will soon be invalidated. We should also invalidate
- // all outer loops because insertion and deletion of blocks that happens
- // during the rotation may violate invariants related to backedge taken
- // infos in them.
- if (SE) {
- SE->forgetTopmostLoop(L);
- // We may hoist some instructions out of loop. In case if they were cached
- // as "loop variant" or "loop computable", these caches must be dropped.
- // We also may fold basic blocks, so cached block dispositions also need
- // to be dropped.
- SE->forgetBlockAndLoopDispositions();
}
- LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Find new Loop header. NewHeader is a Header's one and only successor
- // that is inside loop. Header's other successor is outside the
- // loop. Otherwise loop is not suitable for rotation.
- BasicBlock *Exit = BI->getSuccessor(0);
- BasicBlock *NewHeader = BI->getSuccessor(1);
- bool BISuccsSwapped = L->contains(Exit);
- if (BISuccsSwapped)
- std::swap(Exit, NewHeader);
- assert(NewHeader && "Unable to determine new loop header");
- assert(L->contains(NewHeader) && !L->contains(Exit) &&
- "Unable to determine loop header and exit blocks");
-
- // This code assumes that the new header has exactly one predecessor.
- // Remove any single-entry PHI nodes in it.
- assert(NewHeader->getSinglePredecessor() &&
- "New header doesn't have one pred!");
- FoldSingleEntryPHINodes(NewHeader);
-
- // Begin by walking OrigHeader and populating ValueMap with an entry for
- // each Instruction.
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
- ValueToValueMapTy ValueMap, ValueMapMSSA;
-
- // For PHI nodes, the value available in OldPreHeader is just the
- // incoming value from OldPreHeader.
- for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
- InsertNewValueIntoMap(ValueMap, PN,
- PN->getIncomingValueForBlock(OrigPreheader));
-
- // For the rest of the instructions, either hoist to the OrigPreheader if
- // possible or create a clone in the OldPreHeader if not.
- Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
-
- // Record all debug records preceding LoopEntryBranch to avoid
- // duplication.
- using DbgHash =
- std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>;
- auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
- auto VarLocOps = D->location_ops();
- return {{hash_combine_range(VarLocOps), D->getVariable()},
- D->getExpression()};
- };
-
- SmallDenseSet<DbgHash, 8> DbgRecords;
- // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
- // terminator.
- for (const DbgVariableRecord &DVR :
- filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
- DbgRecords.insert(makeHash(&DVR));
-
- // Remember the local noalias scope declarations in the header. After the
- // rotation, they must be duplicated and the scope must be cloned. This
- // avoids unwanted interaction across iterations.
- SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions;
- for (Instruction &I : *OrigHeader)
- if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
- NoAliasDeclInstructions.push_back(Decl);
-
- Module *M = OrigHeader->getModule();
-
- // Track the next DbgRecord to clone. If we have a sequence where an
- // instruction is hoisted instead of being cloned:
- // DbgRecord blah
- // %foo = add i32 0, 0
- // DbgRecord xyzzy
- // %bar = call i32 @foobar()
- // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
- // attached to %foo, then when %foo his hoisted it will "fall down" onto the
- // function call:
- // DbgRecord blah
- // DbgRecord xyzzy
- // %bar = call i32 @foobar()
- // causing it to appear attached to the call too.
- //
- // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
- // here" position to account for this behaviour. We point it at any
- // DbgRecords on the next instruction, here labelled xyzzy, before we hoist
- // %foo. Later, we only only clone DbgRecords from that position (xyzzy)
- // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
- // a range because it gives us a natural way of testing whether
- // there were DbgRecords on the next instruction before we hoisted things).
- iterator_range<DbgRecord::self_iterator> NextDbgInsts =
- (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange();
-
- while (I != E) {
- Instruction *Inst = &*I++;
-
- // If the instruction's operands are invariant and it doesn't read or write
- // memory, then it is safe to hoist. Doing this doesn't change the order of
- // execution in the preheader, but does prevent the instruction from
- // executing in each iteration of the loop. This means it is safe to hoist
- // something that might trap, but isn't safe to hoist something that reads
- // memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
- !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
- !isa<AllocaInst>(Inst) &&
- // It is not safe to hoist the value of these instructions in
- // coroutines, as the addresses of otherwise eligible variables (e.g.
- // thread-local variables and errno) may change if the coroutine is
- // resumed in a different thread.Therefore, we disable this
- // optimization for correctness. However, this may block other correct
- // optimizations.
- // FIXME: This should be reverted once we have a better model for
- // memory access in coroutines.
- !Inst->getFunction()->isPresplitCoroutine()) {
-
- if (!NextDbgInsts.empty()) {
- auto DbgValueRange =
- LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
- RemapDbgRecordRange(M, DbgValueRange, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- // Erase anything we've seen before.
- for (DbgVariableRecord &DVR :
- make_early_inc_range(filterDbgVars(DbgValueRange)))
- if (DbgRecords.count(makeHash(&DVR)))
- DVR.eraseFromParent();
- }
-
- NextDbgInsts = I->getDbgRecordRange();
-
- Inst->moveBefore(LoopEntryBranch->getIterator());
+ // When preparing for LTO, avoid rotating loops with calls that could be
+ // inlined during the LTO stage.
+ if (PrepareForLTO && Metrics.NumInlineCandidates > 0)
+ return Rotated;
+ }
- ++NumInstrsHoisted;
- continue;
- }
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!OrigPreheader || !L->hasDedicatedExits())
+ return Rotated;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated. We should also invalidate
+ // all outer loops because insertion and deletion of blocks that happens
+ // during the rotation may violate invariants related to backedge taken
+ // infos in them.
+ if (SE) {
+ SE->forgetTopmostLoop(L);
+ // We may hoist some instructions out of the loop. If they were cached
+ // as "loop variant" or "loop computable", these caches must be dropped.
+ // We also may fold basic blocks, so cached block dispositions also need
+ // to be dropped.
+ SE->forgetBlockAndLoopDispositions();
+ }
- // Otherwise, create a duplicate of the instruction.
- Instruction *C = Inst->clone();
- if (const DebugLoc &DL = C->getDebugLoc())
- mapAtomInstance(DL, ValueMap);
+ LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
- C->insertBefore(LoopEntryBranch->getIterator());
+ // Find new Loop header. NewHeader is a Header's one and only successor
+ // that is inside loop. Header's other successor is outside the
+ // loop. Otherwise loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ bool BISuccsSwapped = L->contains(Exit);
+ if (BISuccsSwapped)
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap, ValueMapMSSA;
+
+ // For PHI nodes, the value available in OldPreHeader is just the
+ // incoming value from OldPreHeader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ InsertNewValueIntoMap(ValueMap, PN,
+ PN->getIncomingValueForBlock(OrigPreheader));
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OldPreHeader if not.
+ Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug records preceding LoopEntryBranch to avoid
+ // duplication.
+ using DbgHash =
+ std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
+ auto VarLocOps = D->location_ops();
+ return {{hash_combine_range(VarLocOps), D->getVariable()},
+ D->getExpression()};
+ };
- ++NumInstrsDuplicated;
+ SmallDenseSet<DbgHash, 8> DbgRecords;
+ // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
+ // terminator.
+ for (const DbgVariableRecord &DVR :
+ filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
+ DbgRecords.insert(makeHash(&DVR));
+
+ // Remember the local noalias scope declarations in the header. After the
+ // rotation, they must be duplicated and the scope must be cloned. This
+ // avoids unwanted interaction across iterations.
+ SmallVector<NoAliasScopeDeclInst *, 6> NoAliasDeclInstructions;
+ for (Instruction &I : *OrigHeader)
+ if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
+ NoAliasDeclInstructions.push_back(Decl);
+
+ Module *M = OrigHeader->getModule();
+
+ // Track the next DbgRecord to clone. If we have a sequence where an
+ // instruction is hoisted instead of being cloned:
+ // DbgRecord blah
+ // %foo = add i32 0, 0
+ // DbgRecord xyzzy
+ // %bar = call i32 @foobar()
+ // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
+ // attached to %foo, then when %foo is hoisted it will "fall down" onto the
+ // function call:
+ // DbgRecord blah
+ // DbgRecord xyzzy
+ // %bar = call i32 @foobar()
+ // causing it to appear attached to the call too.
+ //
+ // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
+ // here" position to account for this behaviour. We point it at any
+ // DbgRecords on the next instruction, here labelled xyzzy, before we hoist
+ // %foo. Later, we only clone DbgRecords from that position (xyzzy)
+ // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
+ // a range because it gives us a natural way of testing whether
+ // there were DbgRecords on the next instruction before we hoisted things).
+ iterator_range<DbgRecord::self_iterator> NextDbgInsts =
+ (I != E) ? I->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange();
+
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
+ !isa<AllocaInst>(Inst) &&
+ // It is not safe to hoist the value of these instructions in
+ // coroutines, as the addresses of otherwise eligible variables (e.g.
+ // thread-local variables and errno) may change if the coroutine is
+ // resumed in a different thread. Therefore, we disable this
+ // optimization for correctness. However, this may block other correct
+ // optimizations.
+ // FIXME: This should be reverted once we have a better model for
+ // memory access in coroutines.
+ !Inst->getFunction()->isPresplitCoroutine()) {
if (!NextDbgInsts.empty()) {
- auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
- RemapDbgRecordRange(M, Range, ValueMap,
+ auto DbgValueRange =
+ LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
+ RemapDbgRecordRange(M, DbgValueRange, ValueMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
// Erase anything we've seen before.
for (DbgVariableRecord &DVR :
- make_early_inc_range(filterDbgVars(Range)))
+ make_early_inc_range(filterDbgVars(DbgValueRange)))
if (DbgRecords.count(makeHash(&DVR)))
DVR.eraseFromParent();
}
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifyable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = simplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- InsertNewValueIntoMap(ValueMap, Inst, V);
- if (!C->mayHaveSideEffects()) {
- C->eraseFromParent();
- C = nullptr;
- }
- } else {
- InsertNewValueIntoMap(ValueMap, Inst, C);
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
-
- if (auto *II = dyn_cast<AssumeInst>(C))
- AC->registerAssumption(II);
- // MemorySSA cares whether the cloned instruction was inserted or not, and
- // not whether it can be remapped to a simplified value.
- if (MSSAU)
- InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
- }
- }
+ NextDbgInsts = I->getDbgRecordRange();
- if (!NoAliasDeclInstructions.empty()) {
- // There are noalias scope declarations:
- // (general):
- // Original: OrigPre { OrigHeader NewHeader ... Latch }
- // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader }
- //
- // with D: llvm.experimental.noalias.scope.decl,
- // U: !noalias or !alias.scope depending on D
- // ... { D U1 U2 } can transform into:
- // (0) : ... { D U1 U2 } // no relevant rotation for this part
- // (1) : ... D' { U1 U2 D } // D is part of OrigHeader
- // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader
- //
- // We now want to transform:
- // (1) -> : ... D' { D U1 U2 D'' }
- // (2) -> : ... D' U1' { D U2 D'' U1'' }
- // D: original llvm.experimental.noalias.scope.decl
- // D', U1': duplicate with replaced scopes
- // D'', U1'': different duplicate with replaced scopes
- // This ensures a safe fallback to 'may_alias' introduced by the rotate,
- // as U1'' and U1' scopes will not be compatible wrt to the local restrict
-
- // Clone the llvm.experimental.noalias.decl again for the NewHeader.
- BasicBlock::iterator NewHeaderInsertionPoint =
- NewHeader->getFirstNonPHIIt();
- for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) {
- LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:"
- << *NAD << "\n");
- Instruction *NewNAD = NAD->clone();
- NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint);
- }
+ Inst->moveBefore(LoopEntryBranch->getIterator());
- // Scopes must now be duplicated, once for OrigHeader and once for
- // OrigPreHeader'.
- {
- auto &Context = NewHeader->getContext();
-
- SmallVector<MDNode *, 8> NoAliasDeclScopes;
- for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions)
- NoAliasDeclScopes.push_back(NAD->getScopeList());
-
- LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n");
- cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context,
- "h.rot");
- LLVM_DEBUG(OrigHeader->dump());
-
- // Keep the compile time impact low by only adapting the inserted block
- // of instructions in the OrigPreHeader. This might result in slightly
- // more aliasing between these instructions and those that were already
- // present, but it will be much faster when the original PreHeader is
- // large.
- LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n");
- auto *FirstDecl =
- cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]);
- auto *LastInst = &OrigPreheader->back();
- cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst,
- Context, "pre.rot");
- LLVM_DEBUG(OrigPreheader->dump());
-
- LLVM_DEBUG(dbgs() << " Updated NewHeader:\n");
- LLVM_DEBUG(NewHeader->dump());
- }
+ ++NumInstrsHoisted;
+ continue;
}
- // Along with all the other instructions, we just cloned OrigHeader's
- // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
- // successors by duplicating their incoming values for OrigHeader.
- for (BasicBlock *SuccBB : successors(OrigHeader))
- for (BasicBlock::iterator BI = SuccBB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
- // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
- // OrigPreHeader's old terminator (the original branch into the loop), and
- // remove the corresponding incoming values from the PHI nodes in OrigHeader.
- LoopEntryBranch->eraseFromParent();
- OrigPreheader->flushTerminatorDbgRecords();
-
- // Update MemorySSA before the rewrite call below changes the 1:1
- // instruction:cloned_instruction_or_value mapping.
- if (MSSAU) {
- InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
- MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
- ValueMapMSSA);
- }
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+ if (const DebugLoc &DL = C->getDebugLoc())
+ mapAtomInstance(DL, ValueMap);
- SmallVector<PHINode*, 2> InsertedPHIs;
- // If there were any uses of instructions in the duplicated block outside the
- // loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
- &InsertedPHIs);
-
- // Attach debug records to the new phis if that phi uses a value that
- // previously had debug metadata attached. This keeps the debug info
- // up-to-date in the loop body.
- if (!InsertedPHIs.empty())
- insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
-
- // NewHeader is now the header of the loop.
- L->moveToHeader(NewHeader);
- assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
- // Inform DT about changes to the CFG.
- if (DT) {
- // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
- // the DT about the removed edge to the OrigHeader (that got removed).
- SmallVector<DominatorTree::UpdateType, 3> Updates = {
- {DominatorTree::Insert, OrigPreheader, Exit},
- {DominatorTree::Insert, OrigPreheader, NewHeader},
- {DominatorTree::Delete, OrigPreheader, OrigHeader}};
-
- if (MSSAU) {
- MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- } else {
- DT->applyUpdates(Updates);
- }
- }
+ C->insertBefore(LoopEntryBranch->getIterator());
- // At this point, we've finished our major CFG changes. As part of cloning
- // the loop into the preheader we've simplified instructions and the
- // duplicated conditional branch may now be branching on a constant. If it is
- // branching on a constant and if that constant means that we enter the loop,
- // then we fold away the cond branch to an uncond branch. This simplifies the
- // loop in cases important for nested loops, and it also means we don't have
- // to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
- const Value *Cond = PHBI->getCondition();
- const bool HasConditionalPreHeader =
- !isa<ConstantInt>(Cond) ||
- PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader;
-
- updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped);
+ ++NumInstrsDuplicated;
- if (HasConditionalPreHeader) {
- // The conditional branch can't be folded, handle the general case.
- // Split edges as necessary to preserve LoopSimplify form.
-
- // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore.
- // Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(
- OrigPreheader, NewHeader,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- NewPH->setName(NewHeader->getName() + ".lr.ph");
-
- // Preserve canonical loop form, which means that 'Exit' should have only
- // one predecessor. Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit) ||
- isa<IndirectBrInst>(ExitPred->getTerminator()))
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
+ if (!NextDbgInsts.empty()) {
+ auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
+ RemapDbgRecordRange(M, Range, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
+ // Erase anything we've seen before.
+ for (DbgVariableRecord &DVR : make_early_inc_range(filterDbgVars(Range)))
+ if (DbgRecords.count(makeHash(&DVR)))
+ DVR.eraseFromParent();
+ }
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifyable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = simplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ InsertNewValueIntoMap(ValueMap, Inst, V);
+ if (!C->mayHaveSideEffects()) {
+ C->eraseFromParent();
+ C = nullptr;
}
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- (void)SplitLatchEdge;
} else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
+ InsertNewValueIntoMap(ValueMap, Inst, C);
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+
+ if (auto *II = dyn_cast<AssumeInst>(C))
+ AC->registerAssumption(II);
+ // MemorySSA cares whether the cloned instruction was inserted or not, and
+ // not whether it can be remapped to a simplified value.
+ if (MSSAU)
+ InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
+ }
+ }
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
+ if (!NoAliasDeclInstructions.empty()) {
+ // There are noalias scope declarations:
+ // (general):
+ // Original: OrigPre { OrigHeader NewHeader ... Latch }
+ // after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader }
+ //
+ // with D: llvm.experimental.noalias.scope.decl,
+ // U: !noalias or !alias.scope depending on D
+ // ... { D U1 U2 } can transform into:
+ // (0) : ... { D U1 U2 } // no relevant rotation for this part
+ // (1) : ... D' { U1 U2 D } // D is part of OrigHeader
+ // (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader
+ //
+ // We now want to transform:
+ // (1) -> : ... D' { D U1 U2 D'' }
+ // (2) -> : ... D' U1' { D U2 D'' U1'' }
+ // D: original llvm.experimental.noalias.scope.decl
+ // D', U1': duplicate with replaced scopes
+ // D'', U1'': different duplicate with replaced scopes
+ // This ensures a safe fallback to 'may_alias' introduced by the rotate,
+ // as U1'' and U1' scopes will not be compatible wrt the local restrict
+
+ // Clone the llvm.experimental.noalias.decl again for the NewHeader.
+ BasicBlock::iterator NewHeaderInsertionPoint =
+ NewHeader->getFirstNonPHIIt();
+ for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) {
+ LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:"
+ << *NAD << "\n");
+ Instruction *NewNAD = NAD->clone();
+ NewNAD->insertBefore(*NewHeader, NewHeaderInsertionPoint);
+ }
- // Update MSSA too, if available.
- if (MSSAU)
- MSSAU->removeEdge(OrigPreheader, Exit);
+ // Scopes must now be duplicated, once for OrigHeader and once for
+ // OrigPreHeader'.
+ {
+ auto &Context = NewHeader->getContext();
+
+ SmallVector<MDNode *, 8> NoAliasDeclScopes;
+ for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions)
+ NoAliasDeclScopes.push_back(NAD->getScopeList());
+
+ LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n");
+ cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, {OrigHeader}, Context,
+ "h.rot");
+ LLVM_DEBUG(OrigHeader->dump());
+
+ // Keep the compile time impact low by only adapting the inserted block
+ // of instructions in the OrigPreHeader. This might result in slightly
+ // more aliasing between these instructions and those that were already
+ // present, but it will be much faster when the original PreHeader is
+ // large.
+ LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n");
+ auto *FirstDecl =
+ cast<Instruction>(ValueMap[*NoAliasDeclInstructions.begin()]);
+ auto *LastInst = &OrigPreheader->back();
+ cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, FirstDecl, LastInst,
+ Context, "pre.rot");
+ LLVM_DEBUG(OrigPreheader->dump());
+
+ LLVM_DEBUG(dbgs() << " Updated NewHeader:\n");
+ LLVM_DEBUG(NewHeader->dump());
}
+ }
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ for (BasicBlock *SuccBB : successors(OrigHeader))
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+ OrigPreheader->flushTerminatorDbgRecords();
+
+ // Update MemorySSA before the rewrite call below changes the 1:1
+ // instruction:cloned_instruction_or_value mapping.
+ if (MSSAU) {
+ InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
+ MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
+ ValueMapMSSA);
+ }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ SmallVector<PHINode *, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
+ &InsertedPHIs);
+
+ // Attach debug records to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // Inform DT about changes to the CFG.
+ if (DT) {
+ // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
+ // the DT about the removed edge to the OrigHeader (that got removed).
+ SmallVector<DominatorTree::UpdateType, 3> Updates = {
+ {DominatorTree::Insert, OrigPreheader, Exit},
+ {DominatorTree::Insert, OrigPreheader, NewHeader},
+ {DominatorTree::Delete, OrigPreheader, OrigHeader}};
- // Now that the CFG and DomTree are in a consistent state again, try to merge
- // the OrigHeader block into OrigLatch. This will succeed if they are
- // connected by an unconditional branch. This is just a cleanup so the
- // emitted code isn't too gross in this common case.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- BasicBlock *PredBB = OrigHeader->getUniquePredecessor();
- bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
- if (DidMerge)
- RemoveRedundantDbgInstrs(PredBB);
+ if (MSSAU) {
+ MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ } else {
+ DT->applyUpdates(Updates);
+ }
+ }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ const Value *Cond = PHBI->getCondition();
+ const bool HasConditionalPreHeader =
+ !isa<ConstantInt>(Cond) ||
+ PHBI->getSuccessor(cast<ConstantInt>(Cond)->isZero()) != NewHeader;
+
+ updateBranchWeights(*PHBI, *BI, HasConditionalPreHeader, BISuccsSwapped);
- LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+ if (HasConditionalPreHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Split edges as necessary to preserve LoopSimplify form.
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(predecessors(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit) ||
+ isa<IndirectBrInst>(ExitPred->getTerminator()))
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ (void)SplitLatchEdge;
+ } else {
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT)
+ DT->deleteEdge(OrigPreheader, Exit);
+
+ // Update MSSA too, if available.
+ if (MSSAU)
+ MSSAU->removeEdge(OrigPreheader, Exit);
+ }
- ++NumRotated;
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
- Rotated = true;
- SimplifiedLatch = false;
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ BasicBlock *PredBB = OrigHeader->getUniquePredecessor();
+ bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
+ if (DidMerge)
+ RemoveRedundantDbgInstrs(PredBB);
- // Check that new latch is a deoptimizing exit and then repeat rotation if possible.
- // Deoptimizing latch exit is not a generally typical case, so we just loop over.
- // TODO: if it becomes a performance bottleneck extend rotation algorithm
- // to handle multiple rotations in one go.
- } while (MultiRotate && canRotateDeoptimizingLatchExit(L));
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
return true;
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index b8cfe3a..155fcc5 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -6642,6 +6642,9 @@ public:
/// Return true if the replacement is a lookup table.
bool isLookupTable();
+ /// Return true if the replacement is a bit map.
+ bool isBitMap();
+
private:
// Depending on the switch, there are different alternatives.
enum {
@@ -6932,6 +6935,8 @@ Constant *SwitchReplacement::getDefaultValue() { return DefaultValue; }
bool SwitchReplacement::isLookupTable() { return Kind == LookupTableKind; }
+bool SwitchReplacement::isBitMap() { return Kind == BitMapKind; }
+
static bool isSwitchDense(uint64_t NumCases, uint64_t CaseRange) {
// 40% is the default density for building a jump table in optsize/minsize
// mode. See also TargetLoweringBase::isSuitableForJumpTable(), which this
@@ -7097,7 +7102,8 @@ static void reuseTableCompare(
/// lookup tables.
static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
+ const TargetTransformInfo &TTI,
+ bool ConvertSwitchToLookupTable) {
assert(SI->getNumCases() > 1 && "Degenerate switch?");
BasicBlock *BB = SI->getParent();
@@ -7262,6 +7268,8 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
bool AnyLookupTables = any_of(
PhiToReplacementMap, [](auto &KV) { return KV.second.isLookupTable(); });
+ bool AnyBitMaps = any_of(PhiToReplacementMap,
+ [](auto &KV) { return KV.second.isBitMap(); });
// A few conditions prevent the generation of lookup tables:
// 1. The target does not support lookup tables.
@@ -7274,6 +7282,12 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
Fn->getFnAttribute("no-jump-tables").getValueAsBool()))
return false;
+ // In the early optimization pipeline, disable formation of lookup tables,
+ // bit maps and mask checks, as they may inhibit further optimization.
+ if (!ConvertSwitchToLookupTable &&
+ (AnyLookupTables || AnyBitMaps || NeedMask))
+ return false;
+
Builder.SetInsertPoint(SI);
// TableIndex is the switch condition - TableIndexOffset if we don't
// use the condition directly
@@ -7929,14 +7943,13 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (Options.ForwardSwitchCondToPhi && forwardSwitchConditionToPHI(SI))
return requestResimplify();
- // The conversion from switch to lookup tables results in difficult-to-analyze
- // code and makes pruning branches much harder. This is a problem if the
- // switch expression itself can still be restricted as a result of inlining or
- // CVP. Therefore, only apply this transformation during late stages of the
- // optimisation pipeline.
- if (Options.ConvertSwitchToLookupTable &&
- simplifySwitchLookup(SI, Builder, DTU, DL, TTI))
- return requestResimplify();
+ // The conversion of switches to arithmetic or lookup tables is disabled in
+ // the early optimization pipeline, as it may lose information or make the
+ // resulting code harder to analyze.
+ if (Options.ConvertSwitchToArithmetic || Options.ConvertSwitchToLookupTable)
+ if (simplifySwitchLookup(SI, Builder, DTU, DL, TTI,
+ Options.ConvertSwitchToLookupTable))
+ return requestResimplify();
if (simplifySwitchOfPowersOfTwo(SI, Builder, DL, TTI))
return requestResimplify();
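
The "arithmetic" flavor that stays enabled early is the cheap one: when a dense switch's results are an affine function of the condition, the whole switch folds to arithmetic, with no table and no bit map. A C++ illustration of that kind of rewrite (hand-written equivalent functions showing the idea, not output of the pass):

#include <cassert>

// A dense switch whose results are 10 + 2*X for X in [0, 3].
static int viaSwitch(unsigned X) {
  switch (X) {
  case 0: return 10;
  case 1: return 12;
  case 2: return 14;
  case 3: return 16;
  default: return 0;
  }
}

// Linear-map replacement: one range check plus arithmetic.
static int viaArithmetic(unsigned X) {
  return X < 4 ? int(10 + 2 * X) : 0;
}

int main() {
  for (unsigned X = 0; X < 8; ++X)
    assert(viaSwitch(X) == viaArithmetic(X));
}
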
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3a9770c..600ff8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3141,7 +3141,7 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
while (!WorkList.empty()) {
auto *Cur = dyn_cast<VPSingleDefRecipe>(WorkList.pop_back_val());
- if (!Cur || !Seen.insert(Cur).second)
+ if (!Cur || !Seen.insert(Cur).second || isa<VPBlendRecipe>(Cur))
continue;
for (VPUser *U : Cur->users()) {
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
index b1fe7b1..7ba422d 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
@@ -615,22 +615,14 @@ define void @test_ptrs_aligned_by_4_via_assumption(ptr %start, ptr %end) {
; CHECK-LABEL: 'test_ptrs_aligned_by_4_via_assumption'
; CHECK-NEXT: Classifying expressions for: @test_ptrs_aligned_by_4_via_assumption
; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: ((4 * ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4))<nuw> + %start) LoopDispositions: { %loop: Computable }
; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4
-; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: (4 + (4 * ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4))<nuw> + %start) LoopDispositions: { %loop: Computable }
; CHECK-NEXT: Determining loop execution counts for: @test_ptrs_aligned_by_4_via_assumption
-; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
-; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
-; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
-; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
-; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
-; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903
-; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
-; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
-; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
+; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4611686018427387903
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 4) ]
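
A note on why the predicates vanish: the zext-of-i2 expression in the old output is zero exactly when (ptrtoint %end - ptrtoint %start) mod 4 == 0, i.e. it was a divisibility check. Once the align assumptions make both pointers 4-aligned (the test name suggests %end carries a matching assumption), start mod 4 == 0 and end mod 4 == 0 give (end - start) mod 4 == 0 unconditionally, so the backedge-taken count ((-4 + (-1 * start) + end) /u 4) no longer needs to be predicated.
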
@@ -652,22 +644,14 @@ define void @test_ptrs_aligned_by_8_via_assumption(ptr %start, ptr %end) {
; CHECK-LABEL: 'test_ptrs_aligned_by_8_via_assumption'
; CHECK-NEXT: Classifying expressions for: @test_ptrs_aligned_by_8_via_assumption
; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ]
-; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: ((4 * ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4))<nuw> + %start) LoopDispositions: { %loop: Computable }
; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4
-; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop: Computable }
+; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: (4 + (4 * ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4))<nuw> + %start) LoopDispositions: { %loop: Computable }
; CHECK-NEXT: Determining loop execution counts for: @test_ptrs_aligned_by_8_via_assumption
-; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count.
-; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count.
-; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count.
-; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
-; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
-; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903
-; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
-; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
-; CHECK-NEXT: Predicates:
-; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0
+; CHECK-NEXT: Loop %loop: backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4611686018427387903
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4)
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 8) ]
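For context on the check updates above: both pointer arguments carry "align" assumption operand bundles, so SCEV can now prove the pointer difference is a multiple of the stride and compute an exact backedge-taken count instead of falling back to the predicated form. A minimal sketch of the pattern follows; the function and value names are hypothetical, not part of the commit.

; Minimal sketch, assuming the same stride-4 loop shape as the tests above,
; with both pointers asserted to be 4-byte aligned. Running it through
;   opt -disable-output -passes='print<scalar-evolution>' sketch.ll
; should print an exact (non-predicated) backedge-taken count.
define void @sketch_aligned_ptr_loop(ptr %start, ptr %end) {
entry:
  call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 4) ]
  call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 4) ]
  br label %loop

loop:
  %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ]
  store i32 0, ptr %iv, align 4
  %iv.next = getelementptr i8, ptr %iv, i64 4
  %ec = icmp eq ptr %iv.next, %end
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

declare void @llvm.assume(i1)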
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
index 2ad6e68..f730199 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-min-agpr-alloc.ll
@@ -70,7 +70,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def() {
define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call i64 asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -118,7 +118,7 @@ define amdgpu_kernel void @kernel_uses_asm_physreg() {
define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -154,7 +154,7 @@ define void @func_uses_asm_physreg_agpr() {
define void @func_uses_asm_physreg_agpr_tuple() {
; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -168,7 +168,7 @@ declare void @unknown()
define amdgpu_kernel void @kernel_calls_extern() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: call void @unknown()
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -180,8 +180,8 @@ define amdgpu_kernel void @kernel_calls_extern() {
define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR1]] {
-; CHECK-NEXT: call void @unknown() #[[ATTR10:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR29:[0-9]+]]
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
@@ -192,7 +192,7 @@ define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: call void [[INDIRECT]]()
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -204,8 +204,8 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR10]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR29]]
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
@@ -316,7 +316,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
-; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty
; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
@@ -342,7 +342,7 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -354,7 +354,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -378,7 +378,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -390,7 +390,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR2]] {
; CHECK-NEXT: [[DEF:%.*]] = call ptr asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -402,7 +402,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR5]] {
; CHECK-NEXT: [[DEF:%.*]] = call <2 x ptr> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -414,7 +414,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() {
define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -426,7 +426,7 @@ define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() {
define amdgpu_kernel void @kernel_uses_asm_clobber() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR7:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -438,7 +438,7 @@ define amdgpu_kernel void @kernel_uses_asm_clobber() {
define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_tuple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -450,7 +450,7 @@ define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() {
define amdgpu_kernel void @kernel_uses_asm_clobber_oob() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_oob(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -462,7 +462,7 @@ define amdgpu_kernel void @kernel_uses_asm_clobber_oob() {
define amdgpu_kernel void @kernel_uses_asm_clobber_max() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_max(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -474,7 +474,7 @@ define amdgpu_kernel void @kernel_uses_asm_clobber_max() {
define amdgpu_kernel void @kernel_uses_asm_physreg_oob() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_oob(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -486,7 +486,7 @@ define amdgpu_kernel void @kernel_uses_asm_physreg_oob() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR10:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -498,7 +498,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR10]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -510,7 +510,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() {
define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR10]] {
; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -522,7 +522,7 @@ define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() {
define amdgpu_kernel void @vreg_use_exceeds_register_file() {
; CHECK-LABEL: define amdgpu_kernel void @vreg_use_exceeds_register_file(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -534,7 +534,7 @@ define amdgpu_kernel void @vreg_use_exceeds_register_file() {
define amdgpu_kernel void @vreg_def_exceeds_register_file() {
; CHECK-LABEL: define amdgpu_kernel void @vreg_def_exceeds_register_file(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: [[DEF:%.*]] = call <257 x i32> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -546,7 +546,7 @@ define amdgpu_kernel void @vreg_def_exceeds_register_file() {
define amdgpu_kernel void @multiple() {
; CHECK-LABEL: define amdgpu_kernel void @multiple(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR10]] {
; CHECK-NEXT: [[DEF:%.*]] = call { <16 x i32>, <8 x i32>, <8 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -558,7 +558,7 @@ define amdgpu_kernel void @multiple() {
define amdgpu_kernel void @earlyclobber_0() {
; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_0(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR11:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call <8 x i32> asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -570,7 +570,7 @@ define amdgpu_kernel void @earlyclobber_0() {
define amdgpu_kernel void @earlyclobber_1() {
; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_1(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR12:[0-9]+]] {
; CHECK-NEXT: [[DEF:%.*]] = call { <8 x i32>, <16 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -582,7 +582,7 @@ define amdgpu_kernel void @earlyclobber_1() {
define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR13:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -594,7 +594,7 @@ define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512() {
define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR13]] {
; CHECK-NEXT: [[TMP1:%.*]] = call { i32, <8 x i32>, <16 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -606,7 +606,7 @@ define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512() {
define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR14:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call { i32, <16 x i32> } asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -618,7 +618,7 @@ define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256() {
define amdgpu_kernel void @mixed_physreg_vreg_tuples_0() {
; CHECK-LABEL: define amdgpu_kernel void @mixed_physreg_vreg_tuples_0(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR11]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -630,7 +630,7 @@ define amdgpu_kernel void @mixed_physreg_vreg_tuples_0() {
define amdgpu_kernel void @mixed_physreg_vreg_tuples_1() {
; CHECK-LABEL: define amdgpu_kernel void @mixed_physreg_vreg_tuples_1(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR15:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -642,7 +642,7 @@ define amdgpu_kernel void @mixed_physreg_vreg_tuples_1() {
define amdgpu_kernel void @physreg_raises_limit() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_raises_limit(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR16:[0-9]+]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -652,10 +652,9 @@ define amdgpu_kernel void @physreg_raises_limit() {
ret void
}
-; FIXME: This should require 9. We cannot allocate an a128 at a0.
define amdgpu_kernel void @physreg_tuple_alignment_raises_limit() {
; CHECK-LABEL: define amdgpu_kernel void @physreg_tuple_alignment_raises_limit(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR11]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -667,7 +666,7 @@ define amdgpu_kernel void @physreg_tuple_alignment_raises_limit() {
define amdgpu_kernel void @align3_virtreg() {
; CHECK-LABEL: define amdgpu_kernel void @align3_virtreg(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR6]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -679,7 +678,7 @@ define amdgpu_kernel void @align3_virtreg() {
define amdgpu_kernel void @align3_align4_virtreg() {
; CHECK-LABEL: define amdgpu_kernel void @align3_align4_virtreg(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR15]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -691,7 +690,7 @@ define amdgpu_kernel void @align3_align4_virtreg() {
define amdgpu_kernel void @align2_align4_virtreg() {
; CHECK-LABEL: define amdgpu_kernel void @align2_align4_virtreg(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR15]] {
; CHECK-NEXT: call void asm sideeffect "
; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
@@ -703,7 +702,7 @@ define amdgpu_kernel void @align2_align4_virtreg() {
define amdgpu_kernel void @kernel_uses_write_register_a55() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_write_register_a55(
-; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR17:[0-9]+]] {
; CHECK-NEXT: call void @llvm.write_register.i32(metadata [[META0:![0-9]+]], i32 0)
; CHECK-NEXT: ret void
;
@@ -713,71 +712,313 @@ define amdgpu_kernel void @kernel_uses_write_register_a55() {
define amdgpu_kernel void @kernel_uses_write_register_v55() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_write_register_v55(
-; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.write_register.i32(metadata [[META1:![0-9]+]], i32 0)
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
call void @llvm.write_register.i64(metadata !1, i32 0)
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_write_register_a55_57() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_write_register_a55_57(
-; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-SAME: ) #[[ATTR18:[0-9]+]] {
; CHECK-NEXT: call void @llvm.write_register.i96(metadata [[META2:![0-9]+]], i96 0)
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
call void @llvm.write_register.i64(metadata !2, i96 0)
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_read_register_a55(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_read_register_a55(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR19:[0-9]+]] {
; CHECK-NEXT: [[REG:%.*]] = call i32 @llvm.read_register.i32(metadata [[META0]])
; CHECK-NEXT: store i32 [[REG]], ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
%reg = call i32 @llvm.read_register.i64(metadata !0)
store i32 %reg, ptr addrspace(1) %ptr
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_read_volatile_register_a55(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_read_volatile_register_a55(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR19]] {
; CHECK-NEXT: [[REG:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META0]])
; CHECK-NEXT: store i32 [[REG]], ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
%reg = call i32 @llvm.read_volatile_register.i64(metadata !0)
store i32 %reg, ptr addrspace(1) %ptr
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_read_register_a56_59(ptr addrspace(1) %ptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_read_register_a56_59(
-; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR20:[0-9]+]] {
; CHECK-NEXT: [[REG:%.*]] = call i128 @llvm.read_register.i128(metadata [[META3:![0-9]+]])
; CHECK-NEXT: store i128 [[REG]], ptr addrspace(1) [[PTR]], align 8
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
%reg = call i128 @llvm.read_register.i64(metadata !3)
store i128 %reg, ptr addrspace(1) %ptr
+ call void @use_most()
ret void
}
define amdgpu_kernel void @kernel_uses_write_register_out_of_bounds_a256() {
; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_write_register_out_of_bounds_a256(
-; CHECK-SAME: ) #[[ATTR3]] {
+; CHECK-SAME: ) #[[ATTR9]] {
; CHECK-NEXT: call void @llvm.write_register.i32(metadata [[META4:![0-9]+]], i32 0)
+; CHECK-NEXT: call void @use_most()
; CHECK-NEXT: ret void
;
call void @llvm.write_register.i64(metadata !4, i32 0)
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_multiple_uses() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_multiple_uses(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(i64 poison)
+ call void asm sideeffect "; use $0", "a"(i32 poison)
+ call void asm sideeffect "; use $0", "a"(i128 poison)
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_multiple_defs() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_multiple_defs(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 asm sideeffect "
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 asm sideeffect "
+; CHECK-NEXT: [[TMP3:%.*]] = call i128 asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call i64 asm sideeffect "; def $0", "=a"()
+ call i32 asm sideeffect "; def $0", "=a"()
+ call i128 asm sideeffect "; def $0", "=a"()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_multiple_use_defs() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_multiple_use_defs(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: [[TMP1:%.*]] = call i128 asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(i32 poison)
+ call i128 asm sideeffect "; def $0", "=a"()
+ call void @use_most()
+ ret void
+}
+
+define void @callgraph_b() {
+; CHECK-LABEL: define void @callgraph_b(
+; CHECK-SAME: ) #[[ATTR15]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> asm sideeffect "
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call <4 x i32> asm sideeffect "; def $0", "=a"()
+ call void asm sideeffect "; use $0", "a"(<8 x i32> poison)
+ call void @use_most()
+ ret void
+}
+
+define void @callgraph_c() {
+; CHECK-LABEL: define void @callgraph_c(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 asm sideeffect "
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call i32 asm sideeffect "; def $0", "=a"()
+ call void asm sideeffect "; use $0", "a"(<2 x i32> poison)
+ call void @use_most()
+ ret void
+}
+
+define void @callgraph_a(i1 %cond) {
+; CHECK-LABEL: define void @callgraph_a(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR15]] {
+; CHECK-NEXT: br i1 [[COND]], label [[A:%.*]], label [[B:%.*]]
+; CHECK: a:
+; CHECK-NEXT: call void @callgraph_b()
+; CHECK-NEXT: ret void
+; CHECK: b:
+; CHECK-NEXT: call void @callgraph_c()
+; CHECK-NEXT: ret void
+;
+ br i1 %cond, label %a, label %b
+
+a:
+ call void @callgraph_b()
+ ret void
+
+b:
+ call void @callgraph_c()
+ ret void
+}
+
+
+define void @kernel_max_callgraph(i1 %cond) {
+; CHECK-LABEL: define void @kernel_max_callgraph(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR15]] {
+; CHECK-NEXT: call void @callgraph_a(i1 [[COND]])
+; CHECK-NEXT: ret void
+;
+ call void @callgraph_a(i1 %cond)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_all_virtregs() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_all_virtregs(
+; CHECK-SAME: ) #[[ATTR21:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a,a,a,a,a,a,a,a"(<32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison)
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_all_virtregs_plus_1() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_all_virtregs_plus_1(
+; CHECK-SAME: ) #[[ATTR21]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a,a,a,a,a,a,a,a,a"(<32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, <32 x i32> poison, i32 poison)
+ call void @use_most()
+ ret void
+}
+
+define void @recursive() {
+; CHECK-LABEL: define void @recursive(
+; CHECK-SAME: ) #[[ATTR22:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: call void @recursive()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(<7 x i32> poison)
+ call void @use_most()
+ call void @recursive()
+ ret void
+}
+
+define void @indirect_0() {
+; CHECK-LABEL: define void @indirect_0(
+; CHECK-SAME: ) #[[ATTR22]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(<7 x i32> poison)
+ call void @use_most()
+ ret void
+}
+
+define void @indirect_1() {
+; CHECK-LABEL: define void @indirect_1(
+; CHECK-SAME: ) #[[ATTR23:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i32> asm sideeffect "
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call <3 x i32> asm sideeffect "; def $0", "=a"()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @knowable_indirect_call(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @knowable_indirect_call(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR22]] {
+; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @indirect_0, ptr @indirect_1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @indirect_1
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: call void @indirect_1()
+; CHECK-NEXT: br label [[TMP6:%.*]]
+; CHECK: 3:
+; CHECK-NEXT: br i1 true, label [[TMP4:%.*]], label [[TMP5:%.*]]
+; CHECK: 4:
+; CHECK-NEXT: call void @indirect_0()
+; CHECK-NEXT: br label [[TMP6]]
+; CHECK: 5:
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ %fptr = select i1 %cond, ptr @indirect_0, ptr @indirect_1
+ call void %fptr()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @calls_poison(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @calls_poison(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void poison()
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void poison()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @calls_null(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @calls_null(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void null()
+; CHECK-NEXT: call void @use_most()
+; CHECK-NEXT: ret void
+;
+ call void null()
+ call void @use_most()
+ ret void
+}
+
+define amdgpu_kernel void @indirect_unknown(ptr %fptr) {
+; CHECK-LABEL: define amdgpu_kernel void @indirect_unknown(
+; CHECK-SAME: ptr [[FPTR:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT: call void [[FPTR]]()
+; CHECK-NEXT: ret void
+;
+ call void %fptr()
ret void
}
attributes #0 = { "amdgpu-agpr-alloc"="0" }
+attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
!0 = !{!"a55"}
!1 = !{!"v55"}
@@ -787,16 +1028,35 @@ attributes #0 = { "amdgpu-agpr-alloc"="0" }
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR8:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nounwind "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="1" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="2" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="4" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="6" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-agpr-alloc"="5" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-agpr-alloc"="14" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-agpr-alloc"="256" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-agpr-alloc"="32" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-agpr-alloc"="9" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-agpr-alloc"="64" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR13]] = { "amdgpu-agpr-alloc"="49" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-agpr-alloc"="33" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-agpr-alloc"="8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-agpr-alloc"="13" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-agpr-alloc"="56" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-agpr-alloc"="58" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-agpr-alloc"="56" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-agpr-alloc"="60" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR21]] = { "amdgpu-agpr-alloc"="256" "amdgpu-waves-per-eu"="1,1" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR22]] = { "amdgpu-agpr-alloc"="7" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR23]] = { "amdgpu-agpr-alloc"="3" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR24:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR25:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR26:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(read) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR27:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR28:[0-9]+]] = { nocallback nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR29]] = { "amdgpu-agpr-alloc"="0" }
;.
; CHECK: [[META0]] = !{!"a55"}
; CHECK: [[META1]] = !{!"v55"}
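The attribute table above shows the behavior this renamed test now covers: rather than only the all-or-nothing "amdgpu-agpr-alloc"="0", the AMDGPU attributor derives a minimum AGPR allocation from inline-asm "a" constraints, register tuples, and read/write_register uses, and propagates it through the call graph. Below is a minimal sketch of the smallest interesting case; the kernel name is hypothetical and gfx90a is assumed.

; Minimal sketch: the "a" constraint on a <2 x i32> value needs a pair of
; AGPRs, so the attributor is expected to mark this kernel
; "amdgpu-agpr-alloc"="2", mirroring the ATTR2 entry above. Something like
;   opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor sketch.ll
; should show the inferred attribute.
target triple = "amdgcn-amd-amdhsa"

define amdgpu_kernel void @sketch_uses_agpr_pair() #0 {
  call void asm sideeffect "; use $0", "a"(<2 x i32> poison)
  ret void
}

attributes #0 = { "target-cpu"="gfx90a" }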
diff --git a/llvm/test/CodeGen/AMDGPU/mad_int24.ll b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
index 93fda94..dd88310 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_int24.ll
@@ -1,17 +1,79 @@
-; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=GCN --check-prefix=FUNC
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefixes=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefixes=VI
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=EG,R600,RW
+; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefixes=EG,R600,CM
-; FUNC-LABEL: {{^}}i32_mad24:
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
-; EG: MULLO_INT
-; CM: MULLO_INT
-; GCN: s_bfe_i32
-; GCN: s_bfe_i32
-; GCN: s_mul_i32
-; GCN: s_add_i32
define amdgpu_kernel void @i32_mad24(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: i32_mad24:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_bfe_i32 s0, s0, 0x180000
+; GCN-NEXT: s_bfe_i32 s1, s1, 0x180000
+; GCN-NEXT: s_mul_i32 s0, s0, s1
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; VI-LABEL: i32_mad24:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_bfe_i32 s0, s0, 0x180000
+; VI-NEXT: s_bfe_i32 s1, s1, 0x180000
+; VI-NEXT: s_mul_i32 s0, s0, s1
+; VI-NEXT: s_add_i32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; RW-LABEL: i32_mad24:
+; RW: ; %bb.0: ; %entry
+; RW-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; RW-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; RW-NEXT: CF_END
+; RW-NEXT: PAD
+; RW-NEXT: ALU clause starting at 4:
+; RW-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
+; RW-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: ASHR T1.W, PS, literal.x,
+; RW-NEXT: ASHR * T0.W, PV.W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: MULLO_INT * T0.X, PS, PV.W,
+; RW-NEXT: ADD_INT T0.X, PS, KC0[3].X,
+; RW-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; RW-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: i32_mad24:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
+; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X, T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
+; CM-NEXT: ADD_INT * T0.X, PV.X, KC0[3].X,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = shl i32 %a, 8
%a_24 = ashr i32 %0, 8
@@ -23,13 +85,25 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}mad24_known_bits_destroyed:
-; GCN: s_waitcnt
-; GCN-NEXT: v_mad_i32_i24
-; GCN-NEXT: v_mul_i32_i24
-; GCN-NEXT: s_setpc_b64
define i32 @mad24_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {
-
+; GCN-LABEL: mad24_known_bits_destroyed:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mad_i32_i24 v1, v0, v1, v2
+; GCN-NEXT: v_mul_i32_i24_e32 v0, v1, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mad24_known_bits_destroyed:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mad_i32_i24 v1, v0, v1, v2
+; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: mad24_known_bits_destroyed:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
%shl.0 = shl i32 %a, 8
%sra.0 = ashr i32 %shl.0, 8
%shl.1 = shl i32 %b, 8
@@ -48,12 +122,25 @@ define i32 @mad24_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {
ret i32 %mul1
}
-; GCN-LABEL: {{^}}mad24_intrin_known_bits_destroyed:
-; GCN: s_waitcnt
-; GCN-NEXT: v_mad_i32_i24
-; GCN-NEXT: v_mul_i32_i24
-; GCN-NEXT: s_setpc_b64
define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: mad24_intrin_known_bits_destroyed:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mad_i32_i24 v1, v0, v1, v2
+; GCN-NEXT: v_mul_i32_i24_e32 v0, v1, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mad24_intrin_known_bits_destroyed:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mad_i32_i24 v1, v0, v1, v2
+; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; EG-LABEL: mad24_intrin_known_bits_destroyed:
+; EG: ; %bb.0:
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
%shl.0 = shl i32 %a, 8
%sra.0 = ashr i32 %shl.0, 8
%shl.1 = shl i32 %b, 8
@@ -73,17 +160,177 @@ define i32 @mad24_intrin_known_bits_destroyed(i32 %a, i32 %b, i32 %c) {
}
; Make sure no unnecessary BFEs are emitted in the loop.
-; GCN-LABEL: {{^}}mad24_destroyed_knownbits_2:
-; GCN-NOT: v_bfe
-; GCN: v_mad_i32_i24
-; GCN-NOT: v_bfe
-; GCN: v_mad_i32_i24
-; GCN-NOT: v_bfe
-; GCN: v_mad_i32_i24
-; GCN-NOT: v_bfe
-; GCN: v_mad_i32_i24
-; GCN-NOT: v_bfe
define void @mad24_destroyed_knownbits_2(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(1) %arg3) {
+; GCN-LABEL: mad24_destroyed_knownbits_2:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, 1
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: .LBB3_1: ; %bb6
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: v_mad_i32_i24 v0, v0, v5, v5
+; GCN-NEXT: v_add_i32_e32 v1, vcc, -1, v1
+; GCN-NEXT: v_mad_i32_i24 v5, v0, v5, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT: v_mad_i32_i24 v0, v5, v0, v5
+; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT: v_mad_i32_i24 v0, v0, v5, v0
+; GCN-NEXT: v_mov_b32_e32 v5, v2
+; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB3_1
+; GCN-NEXT: ; %bb.2: ; %bb5
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mad24_destroyed_knownbits_2:
+; VI: ; %bb.0: ; %bb
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v5, 1
+; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: .LBB3_1: ; %bb6
+; VI-NEXT: ; =>This Inner Loop Header: Depth=1
+; VI-NEXT: v_mad_i32_i24 v0, v0, v5, v5
+; VI-NEXT: v_mad_i32_i24 v5, v0, v5, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -1, v1
+; VI-NEXT: v_mad_i32_i24 v0, v5, v0, v5
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NEXT: v_mad_i32_i24 v0, v0, v5, v0
+; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v5, v2
+; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB3_1
+; VI-NEXT: ; %bb.2: ; %bb5
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: flat_store_dword v[3:4], v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; RW-LABEL: mad24_destroyed_knownbits_2:
+; RW: ; %bb.0: ; %bb
+; RW-NEXT: ALU 5, @10, KC0[CB0:0-32], KC1[]
+; RW-NEXT: LOOP_START_DX10 @7
+; RW-NEXT: ALU_PUSH_BEFORE 30, @16, KC0[], KC1[]
+; RW-NEXT: JUMP @6 POP:1
+; RW-NEXT: LOOP_BREAK @6
+; RW-NEXT: POP @6 POP:1
+; RW-NEXT: END_LOOP @2
+; RW-NEXT: ALU 1, @47, KC0[], KC1[]
+; RW-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; RW-NEXT: CF_END
+; RW-NEXT: ALU clause starting at 10:
+; RW-NEXT: MOV T0.X, KC0[2].Y,
+; RW-NEXT: MOV T0.Y, KC0[2].Z,
+; RW-NEXT: MOV * T0.Z, KC0[2].W,
+; RW-NEXT: MOV T0.W, KC0[3].X,
+; RW-NEXT: MOV * T1.W, literal.x,
+; RW-NEXT: 1(1.401298e-45), 0(0.000000e+00)
+; RW-NEXT: ALU clause starting at 16:
+; RW-NEXT: LSHL T2.W, T1.W, literal.x,
+; RW-NEXT: LSHL * T3.W, T0.X, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: ASHR T3.W, PS, literal.x,
+; RW-NEXT: ASHR * T2.W, PV.W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: MULLO_INT * T0.X, PV.W, PS,
+; RW-NEXT: ADD_INT * T1.W, PS, T1.W,
+; RW-NEXT: LSHL * T3.W, PV.W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: ASHR * T3.W, PV.W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: MULLO_INT * T0.X, PV.W, T2.W,
+; RW-NEXT: ADD_INT * T1.W, PS, T1.W,
+; RW-NEXT: LSHL * T2.W, PV.W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: ASHR * T2.W, PV.W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: MULLO_INT * T0.X, PV.W, T3.W,
+; RW-NEXT: ADD_INT * T1.W, PS, T1.W,
+; RW-NEXT: LSHL * T3.W, PV.W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: ASHR * T3.W, PV.W, literal.x,
+; RW-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; RW-NEXT: ADD_INT T0.Y, T0.Y, literal.x,
+; RW-NEXT: MULLO_INT * T0.X, PV.W, T2.W,
+; RW-NEXT: -1(nan), 0(0.000000e+00)
+; RW-NEXT: ADD_INT T0.X, PS, T1.W,
+; RW-NEXT: SETE_INT T2.W, PV.Y, 0.0,
+; RW-NEXT: MOV * T1.W, T0.Z,
+; RW-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
+; RW-NEXT: ALU clause starting at 47:
+; RW-NEXT: LSHR * T1.X, T0.W, literal.x,
+; RW-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: mad24_destroyed_knownbits_2:
+; CM: ; %bb.0: ; %bb
+; CM-NEXT: ALU 5, @10, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LOOP_START_DX10 @7
+; CM-NEXT: ALU_PUSH_BEFORE 41, @16, KC0[], KC1[]
+; CM-NEXT: JUMP @6 POP:1
+; CM-NEXT: LOOP_BREAK @6
+; CM-NEXT: POP @6 POP:1
+; CM-NEXT: END_LOOP @2
+; CM-NEXT: ALU 1, @58, KC0[], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
+; CM-NEXT: CF_END
+; CM-NEXT: ALU clause starting at 10:
+; CM-NEXT: MOV * T1.X, KC0[2].Y,
+; CM-NEXT: MOV T0.X, KC0[2].Z,
+; CM-NEXT: MOV T0.Y, KC0[2].W,
+; CM-NEXT: MOV T0.Z, KC0[3].X,
+; CM-NEXT: MOV * T0.W, literal.x,
+; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 16:
+; CM-NEXT: LSHL T1.Z, T0.W, literal.x,
+; CM-NEXT: LSHL * T1.W, T1.X, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: ASHR T2.Z, PV.W, literal.x,
+; CM-NEXT: ASHR * T1.W, PV.Z, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T1.X, T2.Z, T1.W,
+; CM-NEXT: MULLO_INT T1.Y (MASKED), T2.Z, T1.W,
+; CM-NEXT: MULLO_INT T1.Z (MASKED), T2.Z, T1.W,
+; CM-NEXT: MULLO_INT * T1.W (MASKED), T2.Z, T1.W,
+; CM-NEXT: ADD_INT * T0.W, PV.X, T0.W,
+; CM-NEXT: LSHL * T2.W, PV.W, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: ASHR * T2.W, PV.W, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T1.X, T2.W, T1.W,
+; CM-NEXT: MULLO_INT T1.Y (MASKED), T2.W, T1.W,
+; CM-NEXT: MULLO_INT T1.Z (MASKED), T2.W, T1.W,
+; CM-NEXT: MULLO_INT * T1.W (MASKED), T2.W, T1.W,
+; CM-NEXT: ADD_INT * T0.W, PV.X, T0.W,
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: ASHR * T1.W, PV.W, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T1.X, T1.W, T2.W,
+; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.W, T2.W,
+; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.W, T2.W,
+; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.W, T2.W,
+; CM-NEXT: ADD_INT * T0.W, PV.X, T0.W,
+; CM-NEXT: LSHL * T2.W, PV.W, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T0.X, T0.X, literal.x,
+; CM-NEXT: ASHR * T2.W, PV.W, literal.y,
+; CM-NEXT: -1(nan), 8(1.121039e-44)
+; CM-NEXT: MULLO_INT T1.X, T2.W, T1.W,
+; CM-NEXT: MULLO_INT T1.Y (MASKED), T2.W, T1.W,
+; CM-NEXT: MULLO_INT T1.Z (MASKED), T2.W, T1.W,
+; CM-NEXT: MULLO_INT * T1.W (MASKED), T2.W, T1.W,
+; CM-NEXT: ADD_INT T1.X, PV.X, T0.W,
+; CM-NEXT: SETE_INT T1.Z, T0.X, 0.0,
+; CM-NEXT: MOV * T0.W, T0.Y,
+; CM-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.Z, 0.0,
+; CM-NEXT: ALU clause starting at 58:
+; CM-NEXT: LSHR * T0.X, T0.Z, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
bb:
br label %bb6
@@ -119,3 +366,5 @@ bb6: ; preds = %bb6, %bb
}
declare i32 @llvm.amdgcn.mul.i24(i32, i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; R600: {{.*}}
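For reference, the IR shape these regenerated checks exercise is unchanged from the hand-written version they replace: both multiply operands are sign-extended from their low 24 bits via shl/ashr pairs, which is what lets the GCN backend select v_mad_i32_i24 for the multiply-add. A minimal sketch with hypothetical names:

; Minimal sketch of the mad_i24 pattern matched by the tests above.
define i32 @sketch_mad24(i32 %a, i32 %b, i32 %c) {
  %a.shl = shl i32 %a, 8
  %a.24 = ashr i32 %a.shl, 8   ; sign-extend the low 24 bits of %a
  %b.shl = shl i32 %b, 8
  %b.24 = ashr i32 %b.shl, 8   ; sign-extend the low 24 bits of %b
  %mul = mul i32 %a.24, %b.24  ; 24-bit signed multiply
  %mad = add i32 %mul, %c      ; selected together as v_mad_i32_i24 on GCN
  ret i32 %mad
}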
diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index a6d458e..46b8df4 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -1,19 +1,75 @@
-; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
-; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=EG
+; RUN: llc < %s -mtriple=r600 -mcpu=cayman | FileCheck %s --check-prefixes=CM
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s --check-prefixes=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX8,SI
+; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global | FileCheck %s --check-prefixes=GFX8,VI
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-; FUNC-LABEL: {{^}}u32_mad24:
-; EG: MULLO_INT
-; SI: s_mul_i32
-; SI: s_add_i32
-; VI: s_mul_{{[iu]}}32
-; VI: s_add_{{[iu]}}32
-
define amdgpu_kernel void @u32_mad24(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; EG-LABEL: u32_mad24:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
+; EG-NEXT: ADD_INT T0.X, PS, KC0[3].X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: u32_mad24:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X, T0.W, T0.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T0.Z,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T0.Z,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T0.Z,
+; CM-NEXT: ADD_INT * T0.X, PV.X, KC0[3].X,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-LABEL: u32_mad24:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s0, s0, 0xffffff
+; GCN-NEXT: s_and_b32 s1, s1, 0xffffff
+; GCN-NEXT: s_mul_i32 s0, s0, s1
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: u32_mad24:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_and_b32 s0, s0, 0xffffff
+; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff
+; GFX8-NEXT: s_mul_i32 s0, s0, s1
+; GFX8-NEXT: s_add_i32 s0, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
entry:
%0 = shl i32 %a, 8
%a_24 = lshr i32 %0, 8
@@ -25,18 +81,88 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}i16_mad24:
; The order of A and B does not matter.
-; EG: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
-; EG: ADD_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
; The result must be sign-extended
-; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
-; EG: 16
-; GCN: s_mul_i32 [[MUL:s[0-9]]], {{[s][0-9], [s][0-9]}}
-; GCN: s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}}
-; GCN: s_sext_i32_i16 [[EXT:s[0-9]]], [[MAD]]
-; GCN: v_mov_b32_e32 v0, [[EXT]]
define amdgpu_kernel void @i16_mad24(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
+; EG-LABEL: i16_mad24:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @12, KC0[], KC1[]
+; EG-NEXT: TEX 2 @6
+; EG-NEXT: ALU 4, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
+; EG-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: MULLO_INT * T0.Y, T1.X, T2.X,
+; EG-NEXT: ADD_INT * T0.W, PS, T0.X,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+;
+; CM-LABEL: i16_mad24:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 0, @12, KC0[], KC1[]
+; CM-NEXT: TEX 2 @6
+; CM-NEXT: ALU 8, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: Fetch clause starting at 6:
+; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
+; CM-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
+; CM-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
+; CM-NEXT: ALU clause starting at 12:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 13:
+; CM-NEXT: MULLO_INT T0.X (MASKED), T1.X, T2.X,
+; CM-NEXT: MULLO_INT T0.Y, T1.X, T2.X,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T2.X,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T2.X,
+; CM-NEXT: ADD_INT * T0.W, PV.Y, T0.X,
+; CM-NEXT: BFE_INT * T0.X, PV.W, 0.0, literal.x,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-LABEL: i16_mad24:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_load_dword s4, s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NEXT: s_mul_i32 s2, s4, s2
+; GCN-NEXT: s_add_i32 s2, s2, s3
+; GCN-NEXT: s_sext_i32_i16 s2, s2
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: i16_mad24:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX8-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, s0
+; GFX8-NEXT: s_lshr_b32 s0, s2, 16
+; GFX8-NEXT: s_mul_i32 s0, s8, s0
+; GFX8-NEXT: s_add_i32 s0, s0, s3
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
entry:
%0 = mul i16 %a, %b
%1 = add i16 %0, %c
@@ -46,17 +172,85 @@ entry:
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
-; FUNC-LABEL: {{^}}i8_mad24:
-; EG: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
-; EG: ADD_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
; The result must be sign-extended
-; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
-; EG: 8
-; GCN: s_mul_i32 [[MUL:s[0-9]]], {{[s][0-9], [s][0-9]}}
-; GCN: s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}}
-; GCN: s_sext_i32_i8 [[EXT:s[0-9]]], [[MAD]]
-; GCN: v_mov_b32_e32 v0, [[EXT]]
define amdgpu_kernel void @i8_mad24(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) {
+; EG-LABEL: i8_mad24:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @12, KC0[], KC1[]
+; EG-NEXT: TEX 2 @6
+; EG-NEXT: ALU 4, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
+; EG-NEXT: VTX_READ_8 T2.X, T0.X, 41, #3
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 42, #3
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 13:
+; EG-NEXT: MULLO_INT * T0.Y, T1.X, T2.X,
+; EG-NEXT: ADD_INT * T0.W, PS, T0.X,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+;
+; CM-LABEL: i8_mad24:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 0, @12, KC0[], KC1[]
+; CM-NEXT: TEX 2 @6
+; CM-NEXT: ALU 8, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: Fetch clause starting at 6:
+; CM-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3
+; CM-NEXT: VTX_READ_8 T2.X, T0.X, 41, #3
+; CM-NEXT: VTX_READ_8 T0.X, T0.X, 42, #3
+; CM-NEXT: ALU clause starting at 12:
+; CM-NEXT: MOV * T0.X, 0.0,
+; CM-NEXT: ALU clause starting at 13:
+; CM-NEXT: MULLO_INT T0.X (MASKED), T1.X, T2.X,
+; CM-NEXT: MULLO_INT T0.Y, T1.X, T2.X,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.X, T2.X,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T1.X, T2.X,
+; CM-NEXT: ADD_INT * T0.W, PV.Y, T0.X,
+; CM-NEXT: BFE_INT * T0.X, PV.W, 0.0, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-LABEL: i8_mad24:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s2, s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshr_b32 s4, s2, 8
+; GCN-NEXT: s_lshr_b32 s5, s2, 16
+; GCN-NEXT: s_mul_i32 s2, s2, s4
+; GCN-NEXT: s_add_i32 s2, s2, s5
+; GCN-NEXT: s_sext_i32_i8 s4, s2
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: i8_mad24:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshr_b32 s4, s6, 8
+; GFX8-NEXT: s_lshr_b32 s5, s6, 16
+; GFX8-NEXT: s_mul_i32 s4, s6, s4
+; GFX8-NEXT: s_add_i32 s4, s4, s5
+; GFX8-NEXT: s_sext_i32_i8 s4, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
entry:
%0 = mul i8 %a, %b
%1 = add i8 %0, %c
@@ -72,11 +266,75 @@ entry:
; 24-bit mad pattern wasn't being matched.
; Check that the select instruction is not deleted.
-; FUNC-LABEL: {{^}}i24_i32_i32_mad:
-; EG: CNDE_INT
-; SI: s_cselect
-; GCN2: s_cselect
define amdgpu_kernel void @i24_i32_i32_mad(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+; EG-LABEL: i24_i32_i32_mad:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ASHR * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
+; EG-NEXT: 34(4.764415e-44), 0(0.000000e+00)
+; EG-NEXT: MULLO_INT * T0.X, PV.W, KC0[3].X,
+; EG-NEXT: ADD_INT T0.X, PS, KC0[3].Y,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: i24_i32_i32_mad:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: ALU clause starting at 4:
+; CM-NEXT: ASHR * T0.W, KC0[2].Z, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
+; CM-NEXT: 34(4.764415e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X, T0.W, KC0[3].X,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, KC0[3].X,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, KC0[3].X,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, KC0[3].X,
+; CM-NEXT: ADD_INT * T0.X, PV.X, KC0[3].Y,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-LABEL: i24_i32_i32_mad:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s2, s[4:5], 0xb
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_ashr_i32 s2, s2, 8
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
+; GCN-NEXT: s_cselect_b32 s2, s2, 34
+; GCN-NEXT: s_mul_i32 s2, s2, s6
+; GCN-NEXT: s_add_i32 s4, s2, s7
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: i24_i32_i32_mad:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_ashr_i32 s4, s8, 8
+; GFX8-NEXT: s_cmp_lg_u32 s6, 0
+; GFX8-NEXT: s_cselect_b32 s4, s4, 34
+; GFX8-NEXT: s_mul_i32 s4, s4, s6
+; GFX8-NEXT: s_add_i32 s4, s4, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
entry:
%0 = ashr i32 %a, 8
%1 = icmp ne i32 %c, 0
@@ -87,13 +345,139 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}extra_and:
-; SI-NOT: v_and
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_add_i32
-; SI: s_add_i32
define amdgpu_kernel void @extra_and(ptr addrspace(1) %arg, i32 %arg2, i32 %arg3) {
+; EG-LABEL: extra_and:
+; EG: ; %bb.0: ; %bb
+; EG-NEXT: ALU 5, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LOOP_START_DX10 @7
+; EG-NEXT: ALU_PUSH_BEFORE 12, @16, KC0[], KC1[]
+; EG-NEXT: JUMP @6 POP:1
+; EG-NEXT: LOOP_BREAK @6
+; EG-NEXT: POP @6 POP:1
+; EG-NEXT: END_LOOP @2
+; EG-NEXT: ALU 1, @29, KC0[], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T1.W, literal.x,
+; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV * T3.W, PV.W,
+; EG-NEXT: MOV T0.Z, KC0[2].Y,
+; EG-NEXT: MOV T0.W, KC0[2].Z,
+; EG-NEXT: MOV * T2.W, KC0[2].W,
+; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT: AND_INT * T4.W, T3.W, literal.x,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T3.W, T3.W, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: MULLO_INT * T0.Y, PV.W, T1.W,
+; EG-NEXT: ADD_INT T3.W, T2.W, PS,
+; EG-NEXT: ADD_INT * T1.W, T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.X, PS, PV.W,
+; EG-NEXT: SETNE_INT * T4.W, PV.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
+; EG-NEXT: ALU clause starting at 29:
+; EG-NEXT: LSHR * T1.X, T0.Z, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: extra_and:
+; CM: ; %bb.0: ; %bb
+; CM-NEXT: ALU 5, @10, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LOOP_START_DX10 @7
+; CM-NEXT: ALU_PUSH_BEFORE 17, @16, KC0[], KC1[]
+; CM-NEXT: JUMP @6 POP:1
+; CM-NEXT: LOOP_BREAK @6
+; CM-NEXT: POP @6 POP:1
+; CM-NEXT: END_LOOP @2
+; CM-NEXT: ALU 1, @34, KC0[], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: ALU clause starting at 10:
+; CM-NEXT: MOV * T0.W, literal.x,
+; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+; CM-NEXT: MOV * T1.Z, PV.W,
+; CM-NEXT: MOV T0.Y, KC0[2].Y,
+; CM-NEXT: MOV T0.Z, KC0[2].Z,
+; CM-NEXT: MOV * T1.W, KC0[2].W,
+; CM-NEXT: ALU clause starting at 16:
+; CM-NEXT: AND_INT T1.Y, T1.Z, literal.x,
+; CM-NEXT: AND_INT T2.Z, T0.W, literal.x,
+; CM-NEXT: AND_INT * T0.W, T1.Z, literal.x,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X, T0.W, T2.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T2.Z,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T2.Z,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T2.Z,
+; CM-NEXT: MULLO_INT T0.X (MASKED), T1.Y, T2.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.Y, T2.Z,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.Y, T2.Z,
+; CM-NEXT: MULLO_INT * T0.W, T1.Y, T2.Z,
+; CM-NEXT: ADD_INT T1.Z, T1.W, PV.W,
+; CM-NEXT: ADD_INT * T0.W, T0.Z, T0.X,
+; CM-NEXT: ADD_INT * T0.X, PV.W, PV.Z,
+; CM-NEXT: SETNE_INT * T2.W, PV.X, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
+; CM-NEXT: ALU clause starting at 34:
+; CM-NEXT: LSHR * T1.X, T0.Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-LABEL: extra_and:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: .LBB4_1: ; %bb4
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_and_b32 s3, s6, 0xffffff
+; GCN-NEXT: s_and_b32 s6, s6, 0xffffff
+; GCN-NEXT: s_and_b32 s2, s2, 0xffffff
+; GCN-NEXT: s_mul_i32 s3, s3, s2
+; GCN-NEXT: s_mul_i32 s6, s6, s2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s2, s0, s3
+; GCN-NEXT: s_add_i32 s6, s1, s6
+; GCN-NEXT: s_add_i32 s3, s2, s6
+; GCN-NEXT: s_cmp_lg_u32 s3, 8
+; GCN-NEXT: s_cbranch_scc1 .LBB4_1
+; GCN-NEXT: ; %bb.2: ; %bb18
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: extra_and:
+; GFX8: ; %bb.0: ; %bb
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX8-NEXT: s_mov_b32 s2, 0
+; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: .LBB4_1: ; %bb4
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_and_b32 s3, s6, 0xffffff
+; GFX8-NEXT: s_and_b32 s6, s6, 0xffffff
+; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff
+; GFX8-NEXT: s_mul_i32 s3, s3, s2
+; GFX8-NEXT: s_mul_i32 s6, s6, s2
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_i32 s2, s0, s3
+; GFX8-NEXT: s_add_i32 s6, s1, s6
+; GFX8-NEXT: s_add_i32 s3, s2, s6
+; GFX8-NEXT: s_cmp_lg_u32 s3, 8
+; GFX8-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX8-NEXT: ; %bb.2: ; %bb18
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
bb:
br label %bb4
@@ -119,13 +503,139 @@ bb18: ; preds = %bb4
ret void
}
-; FUNC-LABEL: {{^}}dont_remove_shift
-; SI: s_lshr
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_add_i32
-; SI: s_add_i32
define amdgpu_kernel void @dont_remove_shift(ptr addrspace(1) %arg, i32 %arg2, i32 %arg3) {
+; EG-LABEL: dont_remove_shift:
+; EG: ; %bb.0: ; %bb
+; EG-NEXT: ALU 5, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LOOP_START_DX10 @7
+; EG-NEXT: ALU_PUSH_BEFORE 12, @16, KC0[], KC1[]
+; EG-NEXT: JUMP @6 POP:1
+; EG-NEXT: LOOP_BREAK @6
+; EG-NEXT: POP @6 POP:1
+; EG-NEXT: END_LOOP @2
+; EG-NEXT: ALU 1, @29, KC0[], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV * T1.W, literal.x,
+; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV * T3.W, PV.W,
+; EG-NEXT: MOV T0.Z, KC0[2].Y,
+; EG-NEXT: MOV T0.W, KC0[2].Z,
+; EG-NEXT: MOV * T2.W, KC0[2].W,
+; EG-NEXT: ALU clause starting at 16:
+; EG-NEXT: LSHR T1.W, T1.W, literal.x,
+; EG-NEXT: LSHR * T4.W, T3.W, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, T3.W, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: MULLO_INT * T0.Y, PV.W, T1.W,
+; EG-NEXT: ADD_INT T3.W, T2.W, PS,
+; EG-NEXT: ADD_INT * T1.W, T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.X, PS, PV.W,
+; EG-NEXT: SETNE_INT * T4.W, PV.X, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
+; EG-NEXT: ALU clause starting at 29:
+; EG-NEXT: LSHR * T1.X, T0.Z, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: dont_remove_shift:
+; CM: ; %bb.0: ; %bb
+; CM-NEXT: ALU 5, @10, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LOOP_START_DX10 @7
+; CM-NEXT: ALU_PUSH_BEFORE 17, @16, KC0[], KC1[]
+; CM-NEXT: JUMP @6 POP:1
+; CM-NEXT: LOOP_BREAK @6
+; CM-NEXT: POP @6 POP:1
+; CM-NEXT: END_LOOP @2
+; CM-NEXT: ALU 1, @34, KC0[], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: ALU clause starting at 10:
+; CM-NEXT: MOV * T0.W, literal.x,
+; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
+; CM-NEXT: MOV * T1.Z, PV.W,
+; CM-NEXT: MOV T0.Y, KC0[2].Y,
+; CM-NEXT: MOV T0.Z, KC0[2].Z,
+; CM-NEXT: MOV * T1.W, KC0[2].W,
+; CM-NEXT: ALU clause starting at 16:
+; CM-NEXT: LSHR T1.Y, T1.Z, literal.x,
+; CM-NEXT: LSHR T2.Z, T0.W, literal.x,
+; CM-NEXT: LSHR * T0.W, T1.Z, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X, T0.W, T2.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T2.Z,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T2.Z,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T2.Z,
+; CM-NEXT: MULLO_INT T0.X (MASKED), T1.Y, T2.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T1.Y, T2.Z,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T1.Y, T2.Z,
+; CM-NEXT: MULLO_INT * T0.W, T1.Y, T2.Z,
+; CM-NEXT: ADD_INT T1.Z, T1.W, PV.W,
+; CM-NEXT: ADD_INT * T0.W, T0.Z, T0.X,
+; CM-NEXT: ADD_INT * T0.X, PV.W, PV.Z,
+; CM-NEXT: SETNE_INT * T2.W, PV.X, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
+; CM-NEXT: ALU clause starting at 34:
+; CM-NEXT: LSHR * T1.X, T0.Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-LABEL: dont_remove_shift:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
+; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: .LBB5_1: ; %bb4
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_lshr_b32 s3, s6, 8
+; GCN-NEXT: s_lshr_b32 s6, s6, 8
+; GCN-NEXT: s_lshr_b32 s2, s2, 8
+; GCN-NEXT: s_mul_i32 s3, s3, s2
+; GCN-NEXT: s_mul_i32 s6, s6, s2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s2, s0, s3
+; GCN-NEXT: s_add_i32 s6, s1, s6
+; GCN-NEXT: s_add_i32 s3, s2, s6
+; GCN-NEXT: s_cmp_lg_u32 s3, 8
+; GCN-NEXT: s_cbranch_scc1 .LBB5_1
+; GCN-NEXT: ; %bb.2: ; %bb18
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX8-LABEL: dont_remove_shift:
+; GFX8: ; %bb.0: ; %bb
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
+; GFX8-NEXT: s_mov_b32 s2, 0
+; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: .LBB5_1: ; %bb4
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_lshr_b32 s3, s6, 8
+; GFX8-NEXT: s_lshr_b32 s6, s6, 8
+; GFX8-NEXT: s_lshr_b32 s2, s2, 8
+; GFX8-NEXT: s_mul_i32 s3, s3, s2
+; GFX8-NEXT: s_mul_i32 s6, s6, s2
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_add_i32 s2, s0, s3
+; GFX8-NEXT: s_add_i32 s6, s1, s6
+; GFX8-NEXT: s_add_i32 s3, s2, s6
+; GFX8-NEXT: s_cmp_lg_u32 s3, 8
+; GFX8-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX8-NEXT: ; %bb.2: ; %bb18
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
bb:
br label %bb4
@@ -151,19 +661,234 @@ bb18: ; preds = %bb4
ret void
}
-; FUNC-LABEL: {{^}}i8_mad_sat_16:
-; EG: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
-; EG: ADD_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
-; The result must be sign-extended
-; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
-; EG: 8
-; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; SI: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16
-; SI: v_med3_i32 v{{[0-9]}}, [[EXT]],
-; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; VI: v_max_i16_e32 [[MAX:v[0-9]]], 0xff80, [[MAD]]
-; VI: v_min_i16_e32 {{v[0-9]}}, 0x7f, [[MAX]]
define amdgpu_kernel void @i8_mad_sat_16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(5) %idx) {
+; EG-LABEL: i8_mad_sat_16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @8
+; EG-NEXT: ALU 1, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @10
+; EG-NEXT: ALU 24, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
+; EG-NEXT: Fetch clause starting at 10:
+; EG-NEXT: VTX_READ_8 T3.X, T3.X, 0, #1
+; EG-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: LSHR * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.X,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: ADD_INT T2.X, KC0[2].Z, T0.X,
+; EG-NEXT: ADD_INT * T3.X, KC0[3].X, T0.X,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.W, T3.X, 0.0, literal.x,
+; EG-NEXT: MULLO_INT * T0.Y, PV.Z, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
+; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: MAX_INT T0.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, T0.X,
+; EG-NEXT: -128(nan), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, PS, literal.x,
+; EG-NEXT: MIN_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 127(1.779649e-43)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T2.W, PV.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, T1.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; CM-LABEL: i8_mad_sat_16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @14, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @8
+; CM-NEXT: ALU 1, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 1 @10
+; CM-NEXT: ALU 26, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT MSKOR T1.XW, T0.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: Fetch clause starting at 8:
+; CM-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
+; CM-NEXT: Fetch clause starting at 10:
+; CM-NEXT: VTX_READ_8 T3.X, T3.X, 0, #1
+; CM-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
+; CM-NEXT: ALU clause starting at 14:
+; CM-NEXT: LSHR * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; CM-NEXT: ADD_INT * T1.X, KC0[3].X, PV.X,
+; CM-NEXT: ALU clause starting at 19:
+; CM-NEXT: ADD_INT * T2.X, KC0[2].W, T0.X,
+; CM-NEXT: ADD_INT * T3.X, KC0[2].Z, T0.X,
+; CM-NEXT: ALU clause starting at 21:
+; CM-NEXT: BFE_INT T0.Y, T1.X, 0.0, literal.x,
+; CM-NEXT: BFE_INT T0.Z, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T0.W, T3.X, 0.0, literal.x, BS:VEC_201
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X (MASKED), T0.Z, T0.W,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.Z, T0.W,
+; CM-NEXT: MULLO_INT T0.Z, T0.Z, T0.W,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, PV.Z, T0.Y,
+; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: MAX_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.X,
+; CM-NEXT: -128(nan), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, PV.W, literal.x,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.y,
+; CM-NEXT: 3(4.203895e-45), 127(1.779649e-43)
+; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; CM-NEXT: LSHL T1.X, PV.Z, PV.W,
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: MOV T1.Y, 0.0,
+; CM-NEXT: MOV * T1.Z, 0.0,
+; CM-NEXT: LSHR * T0.X, T0.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-LABEL: i8_mad_sat_16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s22, -1
+; GCN-NEXT: s_mov_b32 s23, 0xe8f000
+; GCN-NEXT: s_add_u32 s20, s20, s11
+; GCN-NEXT: s_addc_u32 s21, s21, 0
+; GCN-NEXT: s_load_dword s8, s[4:5], 0x11
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s9, s8, 4
+; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: buffer_load_dword v1, v1, s[20:23], 0 offen
+; GCN-NEXT: buffer_load_dword v0, v0, s[20:23], 0 offen
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b64 s[14:15], s[10:11]
+; GCN-NEXT: s_mov_b64 s[18:19], s[10:11]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT: s_mov_b64 s[12:13], s[4:5]
+; GCN-NEXT: s_mov_b64 s[16:17], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_sbyte v2, v[0:1], s[12:15], 0 addr64
+; GCN-NEXT: buffer_load_sbyte v3, v[0:1], s[8:11], 0 addr64
+; GCN-NEXT: buffer_load_sbyte v4, v[0:1], s[16:19], 0 addr64
+; GCN-NEXT: s_movk_i32 s2, 0xff80
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mad_u32_u24 v2, v2, v3, v4
+; GCN-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GCN-NEXT: v_mov_b32_e32 v3, 0x7f
+; GCN-NEXT: v_med3_i32 v2, v2, s2, v3
+; GCN-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GCN-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT: s_endpgm
+;
+; SI-LABEL: i8_mad_sat_16:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_load_dword s0, s[4:5], 0x44
+; SI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s90, -1
+; SI-NEXT: s_mov_b32 s91, 0xe80000
+; SI-NEXT: s_add_u32 s88, s88, s11
+; SI-NEXT: s_addc_u32 s89, s89, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s1, s0, 4
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_load_dword v6, v0, s[88:91], 0 offen
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: buffer_load_dword v7, v0, s[88:91], 0 offen
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v3, s5
+; SI-NEXT: v_mov_b32_e32 v5, s7
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; SI-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; SI-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; SI-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; SI-NEXT: flat_load_sbyte v0, v[0:1]
+; SI-NEXT: flat_load_sbyte v1, v[2:3]
+; SI-NEXT: flat_load_sbyte v2, v[4:5]
+; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mad_u16 v0, v1, v0, v2
+; SI-NEXT: v_max_i16_e32 v0, 0xff80, v0
+; SI-NEXT: v_min_i16_e32 v2, 0x7f, v0
+; SI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v7, vcc
+; SI-NEXT: flat_store_byte v[0:1], v2
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: i8_mad_sat_16:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_load_dword s0, s[4:5], 0x44
+; VI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s14, -1
+; VI-NEXT: s_mov_b32 s15, 0xe80000
+; VI-NEXT: s_add_u32 s12, s12, s11
+; VI-NEXT: s_addc_u32 s13, s13, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s1, s0, 4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_load_dword v6, v0, s[12:15], 0 offen
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: buffer_load_dword v7, v0, s[12:15], 0 offen
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; VI-NEXT: flat_load_sbyte v0, v[0:1]
+; VI-NEXT: flat_load_sbyte v1, v[2:3]
+; VI-NEXT: flat_load_sbyte v2, v[4:5]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mad_u16 v0, v1, v0, v2
+; VI-NEXT: v_max_i16_e32 v0, 0xff80, v0
+; VI-NEXT: v_min_i16_e32 v2, 0x7f, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v7, vcc
+; VI-NEXT: flat_store_byte v[0:1], v2
+; VI-NEXT: s_endpgm
entry:
%retval.0.i = load i64, ptr addrspace(5) %idx
%arrayidx = getelementptr inbounds i8, ptr addrspace(1) %in0, i64 %retval.0.i
@@ -187,16 +912,201 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}i8_mad_32:
-; EG: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
-; EG: ADD_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
-; The result must be sign-extended
-; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
-; EG: 8
-; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16
define amdgpu_kernel void @i8_mad_32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(5) %idx) {
+; EG-LABEL: i8_mad_32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @8
+; EG-NEXT: ALU 1, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @10
+; EG-NEXT: ALU 9, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
+; EG-NEXT: Fetch clause starting at 10:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: LSHR * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.X,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: ADD_INT T2.X, KC0[2].Z, T0.X,
+; EG-NEXT: ADD_INT * T0.X, KC0[3].X, T0.X,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, PV.W, PV.Z,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+;
+; CM-LABEL: i8_mad_32:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @14, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @8
+; CM-NEXT: ALU 1, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 1 @10
+; CM-NEXT: ALU 12, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: Fetch clause starting at 8:
+; CM-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
+; CM-NEXT: Fetch clause starting at 10:
+; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; CM-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
+; CM-NEXT: ALU clause starting at 14:
+; CM-NEXT: LSHR * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; CM-NEXT: ADD_INT * T1.X, KC0[3].X, PV.X,
+; CM-NEXT: ALU clause starting at 19:
+; CM-NEXT: ADD_INT * T2.X, KC0[2].W, T0.X,
+; CM-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
+; CM-NEXT: ALU clause starting at 21:
+; CM-NEXT: BFE_INT T0.Y, T1.X, 0.0, literal.x,
+; CM-NEXT: BFE_INT T0.Z, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_201
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X, T0.W, T0.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T0.Z,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T0.Z,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T0.Z,
+; CM-NEXT: ADD_INT * T0.W, PV.X, T0.Y,
+; CM-NEXT: BFE_INT * T0.X, PV.W, 0.0, literal.x,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+;
+; GCN-LABEL: i8_mad_32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s26, -1
+; GCN-NEXT: s_mov_b32 s27, 0xe8f000
+; GCN-NEXT: s_add_u32 s24, s24, s11
+; GCN-NEXT: s_addc_u32 s25, s25, 0
+; GCN-NEXT: s_load_dword s8, s[4:5], 0x11
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s9, s8, 4
+; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: buffer_load_dword v1, v1, s[24:27], 0 offen
+; GCN-NEXT: buffer_load_dword v0, v0, s[24:27], 0 offen
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s14, 0
+; GCN-NEXT: s_mov_b32 s15, s11
+; GCN-NEXT: s_mov_b64 s[18:19], s[14:15]
+; GCN-NEXT: s_mov_b64 s[22:23], s[14:15]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT: s_mov_b64 s[16:17], s[4:5]
+; GCN-NEXT: s_mov_b64 s[20:21], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_sbyte v2, v[0:1], s[12:15], 0 addr64
+; GCN-NEXT: buffer_load_sbyte v3, v[0:1], s[16:19], 0 addr64
+; GCN-NEXT: buffer_load_sbyte v0, v[0:1], s[20:23], 0 addr64
+; GCN-NEXT: s_mov_b32 s10, -1
+; GCN-NEXT: s_mov_b32 s8, s0
+; GCN-NEXT: s_mov_b32 s9, s1
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mad_u32_u24 v0, v1, v2, v0
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_endpgm
+;
+; SI-LABEL: i8_mad_32:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_load_dword s0, s[4:5], 0x44
+; SI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s90, -1
+; SI-NEXT: s_mov_b32 s91, 0xe80000
+; SI-NEXT: s_add_u32 s88, s88, s11
+; SI-NEXT: s_addc_u32 s89, s89, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s1, s0, 4
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_load_dword v4, v0, s[88:91], 0 offen
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: buffer_load_dword v5, v0, s[88:91], 0 offen
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v3, s5
+; SI-NEXT: v_mov_b32_e32 v6, s7
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; SI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; SI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; SI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; SI-NEXT: flat_load_sbyte v0, v[0:1]
+; SI-NEXT: flat_load_sbyte v1, v[2:3]
+; SI-NEXT: flat_load_sbyte v2, v[4:5]
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mad_u16 v0, v0, v1, v2
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: i8_mad_32:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_load_dword s0, s[4:5], 0x44
+; VI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s14, -1
+; VI-NEXT: s_mov_b32 s15, 0xe80000
+; VI-NEXT: s_add_u32 s12, s12, s11
+; VI-NEXT: s_addc_u32 s13, s13, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s1, s0, 4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_load_dword v4, v0, s[12:15], 0 offen
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: buffer_load_dword v5, v0, s[12:15], 0 offen
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; VI-NEXT: flat_load_sbyte v0, v[0:1]
+; VI-NEXT: flat_load_sbyte v1, v[2:3]
+; VI-NEXT: flat_load_sbyte v2, v[4:5]
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mad_u16 v0, v0, v1, v2
+; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
entry:
%retval.0.i = load i64, ptr addrspace(5) %idx
%arrayidx = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %retval.0.i
@@ -215,16 +1125,207 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}i8_mad_64:
-; EG: MULLO_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
-; EG: ADD_INT {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
-; The result must be sign-extended
-; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
-; EG: 8
-; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16
define amdgpu_kernel void @i8_mad_64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(5) %idx) {
+; EG-LABEL: i8_mad_64:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @8
+; EG-NEXT: ALU 1, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @10
+; EG-NEXT: ALU 11, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 8:
+; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
+; EG-NEXT: Fetch clause starting at 10:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: LSHR * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.X,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: ADD_INT T2.X, KC0[2].Z, T0.X,
+; EG-NEXT: ADD_INT * T0.X, KC0[3].X, T0.X,
+; EG-NEXT: ALU clause starting at 21:
+; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, literal.x,
+; EG-NEXT: MULLO_INT * T0.X, PV.W, PV.Z,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+;
+; CM-LABEL: i8_mad_64:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @14, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 0 @8
+; CM-NEXT: ALU 1, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT: TEX 1 @10
+; CM-NEXT: ALU 13, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
+; CM-NEXT: CF_END
+; CM-NEXT: PAD
+; CM-NEXT: Fetch clause starting at 8:
+; CM-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
+; CM-NEXT: Fetch clause starting at 10:
+; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; CM-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
+; CM-NEXT: ALU clause starting at 14:
+; CM-NEXT: LSHR * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; CM-NEXT: ADD_INT * T1.X, KC0[3].X, PV.X,
+; CM-NEXT: ALU clause starting at 19:
+; CM-NEXT: ADD_INT * T2.X, KC0[2].W, T0.X,
+; CM-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
+; CM-NEXT: ALU clause starting at 21:
+; CM-NEXT: BFE_INT T0.Y, T1.X, 0.0, literal.x,
+; CM-NEXT: BFE_INT T0.Z, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_201
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X, T0.W, T0.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T0.Z,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T0.Z,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T0.Z,
+; CM-NEXT: ADD_INT * T0.W, PV.X, T0.Y,
+; CM-NEXT: BFE_INT * T0.X, PV.W, 0.0, literal.x,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; CM-NEXT: ASHR * T0.Y, PV.X, literal.y,
+; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
+;
+; GCN-LABEL: i8_mad_64:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s26, -1
+; GCN-NEXT: s_mov_b32 s27, 0xe8f000
+; GCN-NEXT: s_add_u32 s24, s24, s11
+; GCN-NEXT: s_addc_u32 s25, s25, 0
+; GCN-NEXT: s_load_dword s8, s[4:5], 0x11
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_i32 s9, s8, 4
+; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: buffer_load_dword v1, v1, s[24:27], 0 offen
+; GCN-NEXT: buffer_load_dword v0, v0, s[24:27], 0 offen
+; GCN-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NEXT: s_mov_b32 s14, 0
+; GCN-NEXT: s_mov_b32 s15, s11
+; GCN-NEXT: s_mov_b64 s[18:19], s[14:15]
+; GCN-NEXT: s_mov_b64 s[22:23], s[14:15]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b64 s[12:13], s[2:3]
+; GCN-NEXT: s_mov_b64 s[16:17], s[4:5]
+; GCN-NEXT: s_mov_b64 s[20:21], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_sbyte v2, v[0:1], s[12:15], 0 addr64
+; GCN-NEXT: buffer_load_sbyte v3, v[0:1], s[16:19], 0 addr64
+; GCN-NEXT: buffer_load_sbyte v0, v[0:1], s[20:23], 0 addr64
+; GCN-NEXT: s_mov_b32 s10, -1
+; GCN-NEXT: s_mov_b32 s8, s0
+; GCN-NEXT: s_mov_b32 s9, s1
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mad_u32_u24 v0, v1, v2, v0
+; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_endpgm
+;
+; SI-LABEL: i8_mad_64:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_load_dword s0, s[4:5], 0x44
+; SI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s90, -1
+; SI-NEXT: s_mov_b32 s91, 0xe80000
+; SI-NEXT: s_add_u32 s88, s88, s11
+; SI-NEXT: s_addc_u32 s89, s89, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s1, s0, 4
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_load_dword v4, v0, s[88:91], 0 offen
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: buffer_load_dword v5, v0, s[88:91], 0 offen
+; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v3, s5
+; SI-NEXT: v_mov_b32_e32 v6, s7
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; SI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; SI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; SI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; SI-NEXT: flat_load_sbyte v0, v[0:1]
+; SI-NEXT: flat_load_sbyte v1, v[2:3]
+; SI-NEXT: flat_load_sbyte v2, v[4:5]
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_mad_u16 v0, v0, v1, v2
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: i8_mad_64:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_load_dword s0, s[4:5], 0x44
+; VI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s14, -1
+; VI-NEXT: s_mov_b32 s15, 0xe80000
+; VI-NEXT: s_add_u32 s12, s12, s11
+; VI-NEXT: s_addc_u32 s13, s13, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s1, s0, 4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_load_dword v4, v0, s[12:15], 0 offen
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: buffer_load_dword v5, v0, s[12:15], 0 offen
+; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
+; VI-NEXT: flat_load_sbyte v0, v[0:1]
+; VI-NEXT: flat_load_sbyte v1, v[2:3]
+; VI-NEXT: flat_load_sbyte v2, v[4:5]
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mad_u16 v0, v0, v1, v2
+; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
entry:
%retval.0.i = load i64, ptr addrspace(5) %idx
%arrayidx = getelementptr inbounds i8, ptr addrspace(1) %a, i64 %retval.0.i
@@ -248,17 +1349,236 @@ entry:
; had a chance to form mul24. The mul combine would then see
; extractelement with no known bits and fail. All of the mul/add
; combos in this loop should form v_mad_u32_u24.
-
-; FUNC-LABEL: {{^}}mad24_known_bits_destroyed:
-; GCN: v_mad_u32_u24
-; GCN: v_mad_u32_u24
-; GCN: v_mad_u32_u24
-; GCN: v_mad_u32_u24
-; GCN: v_mad_u32_u24
-; GCN: v_mad_u32_u24
-; GCN: v_mad_u32_u24
-; GCN: v_mad_u32_u24
define void @mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i32 %arg4, i32 %arg5, i32 %arg6, ptr addrspace(1) %arg7, ptr addrspace(1) %arg8) #0 {
+; EG-LABEL: mad24_known_bits_destroyed:
+; EG: ; %bb.0: ; %bb
+; EG-NEXT: ALU 21, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LOOP_START_DX10 @11
+; EG-NEXT: ALU 8, @34, KC0[], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0
+; EG-NEXT: ALU 14, @43, KC0[], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 0
+; EG-NEXT: ALU_PUSH_BEFORE 3, @58, KC0[], KC1[]
+; EG-NEXT: JUMP @10 POP:1
+; EG-NEXT: LOOP_BREAK @10
+; EG-NEXT: POP @10 POP:1
+; EG-NEXT: END_LOOP @2
+; EG-NEXT: CF_END
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: MOV * T0.W, KC0[5].X,
+; EG-NEXT: MOV * T0.Z, KC0[4].W,
+; EG-NEXT: MOV * T0.Y, KC0[4].Z,
+; EG-NEXT: MOV T0.X, KC0[2].Y,
+; EG-NEXT: AND_INT * T1.Y, KC0[4].X, literal.x,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, KC0[3].W, literal.x,
+; EG-NEXT: AND_INT T1.W, KC0[3].Z, literal.x,
+; EG-NEXT: MOV * T2.W, KC0[7].Y,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, PS, literal.x,
+; EG-NEXT: AND_INT T2.Y, KC0[6].Y, literal.y,
+; EG-NEXT: MOV T2.Z, KC0[6].X,
+; EG-NEXT: MOV * T2.W, KC0[5].W,
+; EG-NEXT: 2(2.802597e-45), 16777215(2.350989e-38)
+; EG-NEXT: MOV * T3.W, KC0[7].X,
+; EG-NEXT: LSHR T2.X, PV.W, literal.x,
+; EG-NEXT: MOV T3.Y, KC0[5].Z,
+; EG-NEXT: MOV T3.Z, KC0[6].Z,
+; EG-NEXT: MOV * T3.W, KC0[6].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T4.W, KC0[4].Y,
+; EG-NEXT: ALU clause starting at 34:
+; EG-NEXT: MULLO_INT * T0.X, T0.X, T2.Y,
+; EG-NEXT: ADD_INT * T4.W, PS, T3.Z,
+; EG-NEXT: AND_INT * T4.W, PV.W, literal.x,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.Y,
+; EG-NEXT: MULLO_INT * T0.W, T0.W, T1.Y,
+; EG-NEXT: MULLO_INT * T0.Z, T0.Z, T1.Z,
+; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.W,
+; EG-NEXT: ADD_INT * T0.X, T0.X, T3.Z,
+; EG-NEXT: ALU clause starting at 43:
+; EG-NEXT: ADD_INT * T4.W, T0.Y, T3.Y,
+; EG-NEXT: AND_INT T4.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T5.W, T0.Z, T2.W,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, PS, literal.x,
+; EG-NEXT: ADD_INT T0.W, T0.W, T2.Z,
+; EG-NEXT: MULLO_INT * T0.Y, PV.W, T1.W,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.Y, PS, T3.Y,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: MULLO_INT * T0.Z, PV.Z, T1.Z,
+; EG-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; EG-NEXT: ADD_INT T0.Z, PS, T2.W,
+; EG-NEXT: MULLO_INT * T0.W, PV.W, T1.Y,
+; EG-NEXT: ADD_INT * T0.W, PS, T2.Z,
+; EG-NEXT: ALU clause starting at 58:
+; EG-NEXT: ADD_INT * T3.W, T3.W, literal.x,
+; EG-NEXT: -1(nan), 0(0.000000e+00)
+; EG-NEXT: SETE_INT * T4.W, PV.W, 0.0,
+; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
+;
+; CM-LABEL: mad24_known_bits_destroyed:
+; CM: ; %bb.0: ; %bb
+; CM-NEXT: ALU 22, @12, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LOOP_START_DX10 @11
+; CM-NEXT: ALU 23, @35, KC0[], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
+; CM-NEXT: ALU 23, @59, KC0[], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
+; CM-NEXT: ALU_PUSH_BEFORE 3, @83, KC0[], KC1[]
+; CM-NEXT: JUMP @10 POP:1
+; CM-NEXT: LOOP_BREAK @10
+; CM-NEXT: POP @10 POP:1
+; CM-NEXT: END_LOOP @2
+; CM-NEXT: CF_END
+; CM-NEXT: ALU clause starting at 12:
+; CM-NEXT: MOV * T0.W, KC0[5].X,
+; CM-NEXT: MOV * T0.Z, KC0[4].W,
+; CM-NEXT: MOV * T0.Y, KC0[4].Z,
+; CM-NEXT: MOV T0.X, KC0[2].Y,
+; CM-NEXT: AND_INT * T1.Y, KC0[4].X, literal.x,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.x,
+; CM-NEXT: AND_INT * T1.W, KC0[3].Z, literal.x,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Y, KC0[6].Y, literal.x,
+; CM-NEXT: MOV T2.Z, KC0[6].X,
+; CM-NEXT: MOV * T2.W, KC0[7].Y,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: MOV T3.Y, KC0[5].W,
+; CM-NEXT: MOV T3.Z, KC0[5].Z,
+; CM-NEXT: MOV * T2.W, KC0[7].X,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.X, PV.W, literal.x,
+; CM-NEXT: MOV T4.Y, KC0[6].Z,
+; CM-NEXT: MOV T4.Z, KC0[6].W,
+; CM-NEXT: MOV * T2.W, KC0[4].Y,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 35:
+; CM-NEXT: MULLO_INT T0.X, T0.X, T2.Y,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.X, T2.Y,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.X, T2.Y,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.X, T2.Y,
+; CM-NEXT: ADD_INT * T2.W, PV.X, T4.Y,
+; CM-NEXT: AND_INT * T2.W, PV.W, literal.x,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X, T2.W, T2.Y,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T2.W, T2.Y,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T2.W, T2.Y,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T2.W, T2.Y,
+; CM-NEXT: MULLO_INT T0.X (MASKED), T0.W, T1.Y,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Y,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Y,
+; CM-NEXT: MULLO_INT * T0.W, T0.W, T1.Y,
+; CM-NEXT: MULLO_INT T0.X (MASKED), T0.Z, T1.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.Z, T1.Z,
+; CM-NEXT: MULLO_INT T0.Z, T0.Z, T1.Z,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.Z, T1.Z,
+; CM-NEXT: MULLO_INT T0.X (MASKED), T0.Y, T1.W,
+; CM-NEXT: MULLO_INT T0.Y, T0.Y, T1.W,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.Y, T1.W,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.Y, T1.W,
+; CM-NEXT: ADD_INT * T0.X, T0.X, T4.Y,
+; CM-NEXT: ALU clause starting at 59:
+; CM-NEXT: ADD_INT * T2.W, T0.Y, T3.Z,
+; CM-NEXT: ADD_INT T0.Z, T0.Z, T3.Y,
+; CM-NEXT: AND_INT * T2.W, PV.W, literal.x,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X (MASKED), T2.W, T1.W,
+; CM-NEXT: MULLO_INT T0.Y, T2.W, T1.W,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T2.W, T1.W,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T2.W, T1.W,
+; CM-NEXT: ADD_INT T0.Y, PV.Y, T3.Z,
+; CM-NEXT: ADD_INT T5.Z, T0.W, T2.Z, BS:VEC_021/SCL_122
+; CM-NEXT: AND_INT * T0.W, T0.Z, literal.x,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
+; CM-NEXT: MULLO_INT T0.Z, T0.W, T1.Z,
+; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
+; CM-NEXT: ADD_INT T0.Z, PV.Z, T3.Y,
+; CM-NEXT: AND_INT * T0.W, T5.Z, literal.x,
+; CM-NEXT: 16777215(2.350989e-38), 0(0.000000e+00)
+; CM-NEXT: MULLO_INT T0.X (MASKED), T0.W, T1.Y,
+; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Y,
+; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Y,
+; CM-NEXT: MULLO_INT * T0.W, T0.W, T1.Y,
+; CM-NEXT: ADD_INT * T0.W, PV.W, T2.Z,
+; CM-NEXT: ALU clause starting at 83:
+; CM-NEXT: ADD_INT * T4.Z, T4.Z, literal.x,
+; CM-NEXT: -1(nan), 0(0.000000e+00)
+; CM-NEXT: SETE_INT * T2.W, PV.Z, 0.0,
+; CM-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
+;
+; GCN-LABEL: mad24_known_bits_destroyed:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v13
+; GCN-NEXT: v_and_b32_e32 v1, 0xffffff, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xffffff, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffffff, v4
+; GCN-NEXT: s_mov_b64 s[8:9], 0
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: .LBB9_1: ; %bb19
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: v_mad_u32_u24 v4, v5, v0, v14
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mad_u32_u24 v6, v6, v1, v10
+; GCN-NEXT: v_mad_u32_u24 v7, v7, v2, v11
+; GCN-NEXT: v_mad_u32_u24 v8, v8, v3, v12
+; GCN-NEXT: v_add_i32_e32 v15, vcc, -1, v15
+; GCN-NEXT: v_mad_u32_u24 v5, v4, v0, v14
+; GCN-NEXT: v_mad_u32_u24 v6, v6, v1, v10
+; GCN-NEXT: v_mad_u32_u24 v7, v7, v2, v11
+; GCN-NEXT: v_mad_u32_u24 v8, v8, v3, v12
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GCN-NEXT: buffer_store_dword v5, v[16:17], s[4:7], 0 addr64
+; GCN-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-NEXT: buffer_store_dwordx4 v[5:8], v[18:19], s[4:7], 0 addr64
+; GCN-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_cbranch_execnz .LBB9_1
+; GCN-NEXT: ; %bb.2: ; %bb18
+; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: mad24_known_bits_destroyed:
+; GFX8: ; %bb.0: ; %bb
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v13
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB9_1: ; %bb19
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, -1, v15
+; GFX8-NEXT: v_mad_u32_u24 v4, v5, v0, v14
+; GFX8-NEXT: v_mad_u32_u24 v6, v6, v1, v10
+; GFX8-NEXT: v_mad_u32_u24 v7, v7, v2, v11
+; GFX8-NEXT: v_mad_u32_u24 v8, v8, v3, v12
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX8-NEXT: v_mad_u32_u24 v5, v4, v0, v14
+; GFX8-NEXT: v_mad_u32_u24 v6, v6, v1, v10
+; GFX8-NEXT: v_mad_u32_u24 v7, v7, v2, v11
+; GFX8-NEXT: v_mad_u32_u24 v8, v8, v3, v12
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: flat_store_dword v[16:17], v5
+; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[5:8]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB9_1
+; GFX8-NEXT: ; %bb.2: ; %bb18
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
bb:
%tmp = and i32 %arg4, 16777215
%tmp9 = extractelement <4 x i32> %arg1, i64 1
diff --git a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
index 52f57dc..a8d37be 100644
--- a/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll
@@ -434,7 +434,6 @@ entry:
define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-LABEL: stest_f16i16:
; CHECK: .functype stest_f16i16 (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
-; CHECK-NEXT: .local v128, v128, v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: local.get 5
; CHECK-NEXT: call __truncsfhf2
@@ -474,15 +473,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-NEXT: call __extendhfsf2
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: v128.const 32767, 32767, 32767, 32767
-; CHECK-NEXT: local.tee 8
-; CHECK-NEXT: i32x4.min_s
-; CHECK-NEXT: v128.const -32768, -32768, -32768, -32768
-; CHECK-NEXT: local.tee 9
-; CHECK-NEXT: i32x4.max_s
-; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT: local.tee 10
-; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.splat
@@ -495,13 +485,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: local.get 8
-; CHECK-NEXT: i32x4.min_s
-; CHECK-NEXT: local.get 9
-; CHECK-NEXT: i32x4.max_s
-; CHECK-NEXT: local.get 10
-; CHECK-NEXT: v128.and
-; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i16x8.narrow_i32x4_s
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <8 x half> %x to <8 x i32>
@@ -516,7 +500,6 @@ entry:
define <8 x i16> @utest_f16i16(<8 x half> %x) {
; CHECK-LABEL: utest_f16i16:
; CHECK: .functype utest_f16i16 (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: local.get 5
; CHECK-NEXT: call __truncsfhf2
@@ -556,9 +539,6 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
; CHECK-NEXT: call __extendhfsf2
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT: local.tee 8
-; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.splat
@@ -571,8 +551,6 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: local.get 8
-; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: # fallthrough-return
entry:
@@ -1861,7 +1839,6 @@ entry:
define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-LABEL: stest_f16i16_mm:
; CHECK: .functype stest_f16i16_mm (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
-; CHECK-NEXT: .local v128, v128, v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: local.get 5
; CHECK-NEXT: call __truncsfhf2
@@ -1901,15 +1878,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: call __extendhfsf2
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: v128.const 32767, 32767, 32767, 32767
-; CHECK-NEXT: local.tee 8
-; CHECK-NEXT: i32x4.min_s
-; CHECK-NEXT: v128.const -32768, -32768, -32768, -32768
-; CHECK-NEXT: local.tee 9
-; CHECK-NEXT: i32x4.max_s
-; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT: local.tee 10
-; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.splat
@@ -1922,13 +1890,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.trunc_sat_f32_s
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: local.get 8
-; CHECK-NEXT: i32x4.min_s
-; CHECK-NEXT: local.get 9
-; CHECK-NEXT: i32x4.max_s
-; CHECK-NEXT: local.get 10
-; CHECK-NEXT: v128.and
-; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: i16x8.narrow_i32x4_s
; CHECK-NEXT: # fallthrough-return
entry:
%conv = fptosi <8 x half> %x to <8 x i32>
@@ -1941,7 +1903,6 @@ entry:
define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
; CHECK-LABEL: utest_f16i16_mm:
; CHECK: .functype utest_f16i16_mm (f32, f32, f32, f32, f32, f32, f32, f32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: local.get 5
; CHECK-NEXT: call __truncsfhf2
@@ -1981,9 +1942,6 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: call __extendhfsf2
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
-; CHECK-NEXT: local.tee 8
-; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.splat
@@ -1996,8 +1954,6 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: local.get 7
; CHECK-NEXT: i32.trunc_sat_f32_u
; CHECK-NEXT: i32x4.replace_lane 3
-; CHECK-NEXT: local.get 8
-; CHECK-NEXT: i32x4.min_u
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: # fallthrough-return
entry:
diff --git a/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll b/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll
new file mode 100644
index 0000000..f3f3ba9
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/saturating-truncation.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) #2
+declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) #2
+
+define <16 x i8> @i16_signed(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: i16_signed:
+; CHECK: .functype i16_signed (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.narrow_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %0, <16 x i16> splat (i16 -128))
+ %2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 127))
+ %3 = trunc nsw <16 x i16> %2 to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @i32_signed(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: i32_signed:
+; CHECK: .functype i32_signed (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i16x8.narrow_i32x4_s
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %0, <8 x i32> splat (i32 -32768))
+ %2 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %1, <8 x i32> splat (i32 32767))
+ %3 = trunc nsw <8 x i32> %2 to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @i32_signed_flipped(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: i32_signed_flipped:
+; CHECK: .functype i32_signed_flipped (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i16x8.narrow_i32x4_s
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> splat (i32 32767), <8 x i32> %0)
+ %2 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> splat (i32 -32768), <8 x i32> %1)
+ %3 = trunc nsw <8 x i32> %2 to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <16 x i8> @i16_unsigned(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: i16_unsigned:
+; CHECK: .functype i16_unsigned (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.narrow_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %0, <16 x i16> splat (i16 255))
+ %2 = trunc nuw <16 x i16> %1 to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @i32_unsigned(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: i32_unsigned:
+; CHECK: .functype i32_unsigned (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %bb2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i16x8.narrow_i32x4_u
+; CHECK-NEXT: # fallthrough-return
+bb2:
+ %0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %1 = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %0, <8 x i32> splat (i32 65535))
+ %2 = trunc nsw <8 x i32> %1 to <8 x i16>
+ ret <8 x i16> %2
+}
diff --git a/llvm/test/CodeGen/X86/ptrtoaddr-fast-isel.ll b/llvm/test/CodeGen/X86/ptrtoaddr-fast-isel.ll
new file mode 100644
index 0000000..c302d41
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ptrtoaddr-fast-isel.ll
@@ -0,0 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 < %s -o - | FileCheck %s
+
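+; FastISel must select ptrtoaddr itself here (-fast-isel-abort=1 forbids
+; falling back to SelectionDAG). On x86-64 the address width equals the
+; pointer width, so the cast lowers to a plain register copy, just as a
+; full-width ptrtoint would.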
+define i64 @ptrtoaddr(ptr %p) {
+; CHECK-LABEL: ptrtoaddr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %addr = ptrtoaddr ptr %p to i64
+ ret i64 %addr
+}
diff --git a/llvm/test/DebugInfo/X86/instr-ref-opt-bisect2.ll b/llvm/test/DebugInfo/X86/instr-ref-opt-bisect2.ll
new file mode 100644
index 0000000..92aedfe
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/instr-ref-opt-bisect2.ll
@@ -0,0 +1,36 @@
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=1 | FileCheck %s
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=10 | FileCheck %s
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=100 | FileCheck %s
+
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=1 -fast-isel=true | FileCheck %s
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=10 -fast-isel=true | FileCheck %s
+; RUN: llc %s -o - -stop-after=livedebugvalues -opt-bisect-limit=100 -fast-isel=true | FileCheck %s
+
+; This test has the same purpose as instr-ref-opt-bisect.ll: to check that we
+; do not run into an assert when opt-bisect changes the optimisation level.
+; It simply tests different IR.
+
+; CHECK: DBG_VALUE
+
+target triple = "x86_64-pc-windows-msvc"
+
+define i1 @foo(i32 %arg) !dbg !3 {
+entry:
+ #dbg_value(i32 %arg, !4, !DIExpression(), !5)
+ switch i32 %arg, label %bb [
+ i32 810, label %bb
+ ], !dbg !5
+bb:
+ %a = load volatile i1, ptr null, align 1
+ ret i1 false
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1)
+!1 = !DIFile(filename: "instr-ref-opt-bisect2.ll", directory: ".")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "instr-ref-opt-bisect2", file: !1, unit: !0)
+!4 = !DILocalVariable(name: "arg", arg: 2, scope: !3)
+!5 = !DILocation(line: 0, scope: !3)
diff --git a/llvm/test/MC/AArch64/armv9a-sysp-diagnostics.s b/llvm/test/MC/AArch64/armv9a-sysp-diagnostics.s
new file mode 100644
index 0000000..f8baf37
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv9a-sysp-diagnostics.s
@@ -0,0 +1,95 @@
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN: | FileCheck %s --check-prefixes=CHECK-ERROR
+
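+// Each operand below is valid for TLBI but not for TLBIP, which only accepts
+// operands that carry an address payload, so every line should diagnose.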
+tlbip ALLE1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE2OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ALLE3OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip ASIDE1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip PAALL
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip PAALLOS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip RPALOS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip RPAOS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLE1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLS12E1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1IS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1ISNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1NXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1OS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
+tlbip VMALLWS2E1OSNXS
+// CHECK-ERROR: error: invalid operand for TLBIP instruction
diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll
index 6fa57f1..3536932 100644
--- a/llvm/test/Other/new-pm-print-pipeline.ll
+++ b/llvm/test/Other/new-pm-print-pipeline.ll
@@ -50,7 +50,7 @@
; CHECK-17: function(print<stack-lifetime><may>,print<stack-lifetime><must>)
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
-; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)
+; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;no-switch-to-arithmetic;switch-to-lookup;keep-loops;hoist-common-insts;hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-arithmetic;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>,loop-vectorize<interleave-forced-only;vectorize-forced-only>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-19
; CHECK-19: function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,loop-vectorize<interleave-forced-only;vectorize-forced-only;>)
diff --git a/llvm/test/Transforms/GVN/ptrtoaddr.ll b/llvm/test/Transforms/GVN/ptrtoaddr.ll
new file mode 100644
index 0000000..6d02bc6
--- /dev/null
+++ b/llvm/test/Transforms/GVN/ptrtoaddr.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=gvn < %s | FileCheck %s
+
+define i64 @ptrtoaddr_same(ptr %p) {
+; CHECK-LABEL: define i64 @ptrtoaddr_same(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[J:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT: ret i64 0
+;
+ %i = ptrtoaddr ptr %p to i64
+ %j = ptrtoaddr ptr %p to i64
+ %sub = sub i64 %i, %j
+ ret i64 %sub
+}
+
+; Note that unlike for ptrtoint, it's not possible for ptrtoaddr to differ
+; in result type for the same input.
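+; For example, GVN must not unify:
+;   %i = ptrtoint ptr %p to i64
+;   %j = ptrtoint ptr %p to i32
+; but a ptrtoaddr whose result is not the address width is rejected by the
+; verifier, so no type check is needed here.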
+define i64 @ptrtoaddr_different(ptr %p, ptr %p2) {
+; CHECK-LABEL: define i64 @ptrtoaddr_different(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: [[I:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT: [[J:%.*]] = ptrtoaddr ptr [[P2]] to i64
+; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[I]], [[J]]
+; CHECK-NEXT: ret i64 [[SUB]]
+;
+ %i = ptrtoaddr ptr %p to i64
+ %j = ptrtoaddr ptr %p2 to i64
+ %sub = sub i64 %i, %j
+ ret i64 %sub
+}
diff --git a/llvm/test/Transforms/InstCombine/fold-selective-shift.ll b/llvm/test/Transforms/InstCombine/fold-selective-shift.ll
new file mode 100644
index 0000000..2b22965
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-selective-shift.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=instcombine %s -S | FileCheck %s
+
+declare void @clobber.i32(i32)
+
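+; In the tests below, %pack = (%upper << 16) | %lower and the shift amount
+; (%mask & 16) is either 0 or 16, so lshr+trunc yields %lower when the mask
+; bit is clear and %upper when it is set; the fold turns this into a select.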
+define i16 @selective_shift_16(i32 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16(
+; CHECK-SAME: i32 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i32 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL_V:%.*]] = select i1 [[MASK_BIT_Z]], i16 [[LOWER]], i16 [[UPPER]]
+; CHECK-NEXT: ret i16 [[SEL_V]]
+;
+ %upper.zext = zext i16 %upper to i32
+ %upper.shl = shl nuw i32 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i32
+ %pack = or disjoint i32 %upper.shl, %lower.zext
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %trunc = trunc i32 %sel to i16
+ ret i16 %trunc
+}
+
+define i16 @selective_shift_16.commute(i32 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16.commute(
+; CHECK-SAME: i32 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i32 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL_V:%.*]] = select i1 [[MASK_BIT_Z]], i16 [[LOWER]], i16 [[UPPER]]
+; CHECK-NEXT: ret i16 [[SEL_V]]
+;
+ %upper.zext = zext i16 %upper to i32
+ %upper.shl = shl nuw i32 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i32
+ %pack = or disjoint i32 %lower.zext, %upper.shl
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %trunc = trunc i32 %sel to i16
+ ret i16 %trunc
+}
+
+define i16 @selective_shift_16.range(i32 %mask, i32 %upper, i32 range(i32 0, 65536) %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16.range(
+; CHECK-SAME: i32 [[MASK:%.*]], i32 [[UPPER:%.*]], i32 range(i32 0, 65536) [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i32 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[MASK_BIT_Z]], i32 [[LOWER]], i32 [[UPPER]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT: ret i16 [[TRUNC]]
+;
+ %upper.shl = shl nuw i32 %upper, 16
+ %pack = or disjoint i32 %upper.shl, %lower
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %trunc = trunc i32 %sel to i16
+ ret i16 %trunc
+}
+
+define i16 @selective_shift_16.range.commute(i32 %mask, i32 %upper, i32 range(i32 0, 65536) %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16.range.commute(
+; CHECK-SAME: i32 [[MASK:%.*]], i32 [[UPPER:%.*]], i32 range(i32 0, 65536) [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i32 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL:%.*]] = select i1 [[MASK_BIT_Z]], i32 [[LOWER]], i32 [[UPPER]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT: ret i16 [[TRUNC]]
+;
+ %upper.shl = shl nuw i32 %upper, 16
+ %pack = or disjoint i32 %lower, %upper.shl
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %trunc = trunc i32 %sel to i16
+ ret i16 %trunc
+}
+
+define i32 @selective_shift_16.masked(i32 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i32 @selective_shift_16.masked(
+; CHECK-SAME: i32 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i32 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL_V:%.*]] = select i1 [[MASK_BIT_Z]], i16 [[LOWER]], i16 [[UPPER]]
+; CHECK-NEXT: [[SEL:%.*]] = zext i16 [[SEL_V]] to i32
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %upper.zext = zext i16 %upper to i32
+ %upper.shl = shl nuw i32 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i32
+ %pack = or disjoint i32 %lower.zext, %upper.shl
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %sel.masked = and i32 %sel, 65535
+ ret i32 %sel.masked
+}
+
+define i32 @selective_shift_16.masked.commute(i32 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i32 @selective_shift_16.masked.commute(
+; CHECK-SAME: i32 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i32 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL_V:%.*]] = select i1 [[MASK_BIT_Z]], i16 [[LOWER]], i16 [[UPPER]]
+; CHECK-NEXT: [[SEL:%.*]] = zext i16 [[SEL_V]] to i32
+; CHECK-NEXT: ret i32 [[SEL]]
+;
+ %upper.zext = zext i16 %upper to i32
+ %upper.shl = shl nuw i32 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i32
+ %pack = or disjoint i32 %upper.shl, %lower.zext
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %sel.masked = and i32 %sel, 65535
+ ret i32 %sel.masked
+}
+
+define <2 x i16> @selective_shift.v16(<2 x i32> %mask, <2 x i16> %upper, <2 x i16> %lower) {
+; CHECK-LABEL: define <2 x i16> @selective_shift.v16(
+; CHECK-SAME: <2 x i32> [[MASK:%.*]], <2 x i16> [[UPPER:%.*]], <2 x i16> [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and <2 x i32> [[MASK]], splat (i32 16)
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq <2 x i32> [[MASK_BIT]], zeroinitializer
+; CHECK-NEXT: [[SEL_V:%.*]] = select <2 x i1> [[MASK_BIT_Z]], <2 x i16> [[LOWER]], <2 x i16> [[UPPER]]
+; CHECK-NEXT: ret <2 x i16> [[SEL_V]]
+;
+ %upper.zext = zext <2 x i16> %upper to <2 x i32>
+ %upper.shl = shl nuw <2 x i32> %upper.zext, splat(i32 16)
+ %lower.zext = zext <2 x i16> %lower to <2 x i32>
+ %pack = or disjoint <2 x i32> %upper.shl, %lower.zext
+ %mask.bit = and <2 x i32> %mask, splat(i32 16)
+ %sel = lshr <2 x i32> %pack, %mask.bit
+ %trunc = trunc <2 x i32> %sel to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define i16 @selective_shift_16.wide(i64 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16.wide(
+; CHECK-SAME: i64 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i64 [[MASK]], 16
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i64 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL_V:%.*]] = select i1 [[MASK_BIT_Z]], i16 [[LOWER]], i16 [[UPPER]]
+; CHECK-NEXT: ret i16 [[SEL_V]]
+;
+ %upper.zext = zext i16 %upper to i64
+ %upper.shl = shl nuw i64 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i64
+ %pack = or disjoint i64 %upper.shl, %lower.zext
+ %mask.bit = and i64 %mask, 16
+ %sel = lshr i64 %pack, %mask.bit
+ %trunc = trunc i64 %sel to i16
+ ret i16 %trunc
+}
+
+; the zext type is narrower than twice the element width, so the shl drops the
+; top bits of %upper and the fold does not apply
+define i16 @selective_shift_16.narrow(i24 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16.narrow(
+; CHECK-SAME: i24 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[UPPER_ZEXT:%.*]] = zext i16 [[UPPER]] to i24
+; CHECK-NEXT: [[UPPER_SHL:%.*]] = shl i24 [[UPPER_ZEXT]], 16
+; CHECK-NEXT: [[LOWER_ZEXT:%.*]] = zext i16 [[LOWER]] to i24
+; CHECK-NEXT: [[PACK:%.*]] = or disjoint i24 [[UPPER_SHL]], [[LOWER_ZEXT]]
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i24 [[MASK]], 16
+; CHECK-NEXT: [[SEL:%.*]] = lshr i24 [[PACK]], [[MASK_BIT]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i24 [[SEL]] to i16
+; CHECK-NEXT: ret i16 [[TRUNC]]
+;
+ %upper.zext = zext i16 %upper to i24
+ %upper.shl = shl i24 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i24
+ %pack = or disjoint i24 %upper.shl, %lower.zext
+ %mask.bit = and i24 %mask, 16
+ %sel = lshr i24 %pack, %mask.bit
+ %trunc = trunc i24 %sel to i16
+ ret i16 %trunc
+}
+
+; %lower's upper bits block fold
+define i16 @selective_shift_16_norange(i32 %mask, i32 %upper, i32 %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16_norange(
+; CHECK-SAME: i32 [[MASK:%.*]], i32 [[UPPER:%.*]], i32 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[UPPER_SHL:%.*]] = shl nuw i32 [[UPPER]], 16
+; CHECK-NEXT: [[PACK:%.*]] = or i32 [[UPPER_SHL]], [[LOWER]]
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[SEL:%.*]] = lshr i32 [[PACK]], [[MASK_BIT]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT: ret i16 [[TRUNC]]
+;
+ %upper.shl = shl nuw i32 %upper, 16
+ %pack = or i32 %upper.shl, %lower
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %trunc = trunc i32 %sel to i16
+ ret i16 %trunc
+}
+
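+; extra uses of the zext values themselves do not block the fold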
+define i16 @selective_shift_16.mu.0(i32 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16.mu.0(
+; CHECK-SAME: i32 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[UPPER_ZEXT:%.*]] = zext i16 [[UPPER]] to i32
+; CHECK-NEXT: call void @clobber.i32(i32 [[UPPER_ZEXT]])
+; CHECK-NEXT: [[LOWER_ZEXT:%.*]] = zext i16 [[LOWER]] to i32
+; CHECK-NEXT: call void @clobber.i32(i32 [[LOWER_ZEXT]])
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i32 [[MASK_BIT]], 0
+; CHECK-NEXT: [[TRUNC:%.*]] = select i1 [[MASK_BIT_Z]], i16 [[LOWER]], i16 [[UPPER]]
+; CHECK-NEXT: ret i16 [[TRUNC]]
+;
+ %upper.zext = zext i16 %upper to i32
+ call void @clobber.i32(i32 %upper.zext)
+ %upper.shl = shl nuw i32 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i32
+ call void @clobber.i32(i32 %lower.zext)
+ %pack = or disjoint i32 %upper.shl, %lower.zext
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %trunc = trunc i32 %sel to i16
+ ret i16 %trunc
+}
+
+; multi-use of %pack blocks fold
+define i16 @selective_shift_16.mu.1(i32 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16.mu.1(
+; CHECK-SAME: i32 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[UPPER_ZEXT:%.*]] = zext i16 [[UPPER]] to i32
+; CHECK-NEXT: [[UPPER_SHL:%.*]] = shl nuw i32 [[UPPER_ZEXT]], 16
+; CHECK-NEXT: [[LOWER_ZEXT:%.*]] = zext i16 [[LOWER]] to i32
+; CHECK-NEXT: [[PACK:%.*]] = or disjoint i32 [[UPPER_SHL]], [[LOWER_ZEXT]]
+; CHECK-NEXT: call void @clobber.i32(i32 [[PACK]])
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[SEL:%.*]] = lshr i32 [[PACK]], [[MASK_BIT]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT: ret i16 [[TRUNC]]
+;
+ %upper.zext = zext i16 %upper to i32
+ %upper.shl = shl nuw i32 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i32
+ %pack = or disjoint i32 %upper.shl, %lower.zext
+ call void @clobber.i32(i32 %pack)
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ %trunc = trunc i32 %sel to i16
+ ret i16 %trunc
+}
+
+; non-truncated use of %sel blocks fold
+define i16 @selective_shift_16.mu.2(i32 %mask, i16 %upper, i16 %lower) {
+; CHECK-LABEL: define i16 @selective_shift_16.mu.2(
+; CHECK-SAME: i32 [[MASK:%.*]], i16 [[UPPER:%.*]], i16 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[UPPER_ZEXT:%.*]] = zext i16 [[UPPER]] to i32
+; CHECK-NEXT: [[UPPER_SHL:%.*]] = shl nuw i32 [[UPPER_ZEXT]], 16
+; CHECK-NEXT: [[LOWER_ZEXT:%.*]] = zext i16 [[LOWER]] to i32
+; CHECK-NEXT: [[PACK:%.*]] = or disjoint i32 [[UPPER_SHL]], [[LOWER_ZEXT]]
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i32 [[MASK]], 16
+; CHECK-NEXT: [[SEL:%.*]] = lshr i32 [[PACK]], [[MASK_BIT]]
+; CHECK-NEXT: call void @clobber.i32(i32 [[SEL]])
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SEL]] to i16
+; CHECK-NEXT: ret i16 [[TRUNC]]
+;
+ %upper.zext = zext i16 %upper to i32
+ %upper.shl = shl nuw i32 %upper.zext, 16
+ %lower.zext = zext i16 %lower to i32
+ %pack = or disjoint i32 %upper.shl, %lower.zext
+ %mask.bit = and i32 %mask, 16
+ %sel = lshr i32 %pack, %mask.bit
+ call void @clobber.i32(i32 %sel)
+ %trunc = trunc i32 %sel to i16
+ ret i16 %trunc
+}
+
+; the bitwidth must be a power of two so that (%mask & bitwidth) is either 0
+; or the bitwidth itself; here (%mask & 24) can also be 8 or 16, so the shift
+; does not implement a two-way select
+define i24 @selective_shift_24(i48 %mask, i24 %upper, i24 %lower) {
+; CHECK-LABEL: define i24 @selective_shift_24(
+; CHECK-SAME: i48 [[MASK:%.*]], i24 [[UPPER:%.*]], i24 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[UPPER_ZEXT:%.*]] = zext i24 [[UPPER]] to i48
+; CHECK-NEXT: [[UPPER_SHL:%.*]] = shl nuw i48 [[UPPER_ZEXT]], 24
+; CHECK-NEXT: [[LOWER_ZEXT:%.*]] = zext i24 [[LOWER]] to i48
+; CHECK-NEXT: [[PACK:%.*]] = or disjoint i48 [[UPPER_SHL]], [[LOWER_ZEXT]]
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i48 [[MASK]], 24
+; CHECK-NEXT: [[SEL:%.*]] = lshr i48 [[PACK]], [[MASK_BIT]]
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i48 [[SEL]] to i24
+; CHECK-NEXT: ret i24 [[TRUNC]]
+;
+ %upper.zext = zext i24 %upper to i48
+ %upper.shl = shl nuw i48 %upper.zext, 24
+ %lower.zext = zext i24 %lower to i48
+ %pack = or disjoint i48 %upper.shl, %lower.zext
+ %mask.bit = and i48 %mask, 24
+ %sel = lshr i48 %pack, %mask.bit
+ %trunc = trunc i48 %sel to i24
+ ret i24 %trunc
+}
+
+define i32 @selective_shift_32(i64 %mask, i32 %upper, i32 %lower) {
+; CHECK-LABEL: define i32 @selective_shift_32(
+; CHECK-SAME: i64 [[MASK:%.*]], i32 [[UPPER:%.*]], i32 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i64 [[MASK]], 32
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i64 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL_V:%.*]] = select i1 [[MASK_BIT_Z]], i32 [[LOWER]], i32 [[UPPER]]
+; CHECK-NEXT: ret i32 [[SEL_V]]
+;
+ %upper.zext = zext i32 %upper to i64
+ %upper.shl = shl nuw i64 %upper.zext, 32
+ %lower.zext = zext i32 %lower to i64
+ %pack = or disjoint i64 %upper.shl, %lower.zext
+ %mask.bit = and i64 %mask, 32
+ %sel = lshr i64 %pack, %mask.bit
+ %trunc = trunc i64 %sel to i32
+ ret i32 %trunc
+}
+
+define i32 @selective_shift_32.commute(i64 %mask, i32 %upper, i32 %lower) {
+; CHECK-LABEL: define i32 @selective_shift_32.commute(
+; CHECK-SAME: i64 [[MASK:%.*]], i32 [[UPPER:%.*]], i32 [[LOWER:%.*]]) {
+; CHECK-NEXT: [[MASK_BIT:%.*]] = and i64 [[MASK]], 32
+; CHECK-NEXT: [[MASK_BIT_Z:%.*]] = icmp eq i64 [[MASK_BIT]], 0
+; CHECK-NEXT: [[SEL_V:%.*]] = select i1 [[MASK_BIT_Z]], i32 [[LOWER]], i32 [[UPPER]]
+; CHECK-NEXT: ret i32 [[SEL_V]]
+;
+ %upper.zext = zext i32 %upper to i64
+ %upper.shl = shl nuw i64 %upper.zext, 32
+ %lower.zext = zext i32 %lower to i64
+ %pack = or disjoint i64 %lower.zext, %upper.shl
+ %mask.bit = and i64 %mask, 32
+ %sel = lshr i64 %pack, %mask.bit
+ %trunc = trunc i64 %sel to i32
+ ret i32 %trunc
+}
diff --git a/llvm/test/Transforms/LoopRotate/multiple-deopt-exits.ll b/llvm/test/Transforms/LoopRotate/multiple-deopt-exits.ll
deleted file mode 100644
index 72bc543..0000000
--- a/llvm/test/Transforms/LoopRotate/multiple-deopt-exits.ll
+++ /dev/null
@@ -1,164 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S < %s -passes='loop(loop-rotate)' -loop-rotate-multi=true | FileCheck %s
-
-; Test loop rotation with multiple exits, some of them - deoptimizing.
-; We should end up with a latch which exit is non-deoptimizing, so we should rotate
-; more than once.
-
-declare i32 @llvm.experimental.deoptimize.i32(...)
-
-define i32 @test_cond_with_one_deopt_exit(ptr nonnull %a, i64 %x) {
-; Rotation done twice.
-; Latch should be at the 2nd condition (for.cond2), exiting to %return.
-;
-; CHECK-LABEL: @test_cond_with_one_deopt_exit(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VAL_A_IDX3:%.*]] = load i32, ptr %a, align 4
-; CHECK-NEXT: [[ZERO_CHECK4:%.*]] = icmp eq i32 [[VAL_A_IDX3]], 0
-; CHECK-NEXT: br i1 [[ZERO_CHECK4]], label %deopt.exit, label %for.cond2.lr.ph
-; CHECK: for.cond2.lr.ph:
-; CHECK-NEXT: [[FOR_CHECK8:%.*]] = icmp ult i64 0, %x
-; CHECK-NEXT: br i1 [[FOR_CHECK8]], label %for.body.lr.ph, label %return
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: br label %for.body
-; CHECK: for.cond2:
-; CHECK: [[FOR_CHECK:%.*]] = icmp ult i64 {{%.*}}, %x
-; CHECK-NEXT: br i1 [[FOR_CHECK]], label %for.body, label %for.cond2.return_crit_edge
-; CHECK: for.body:
-; CHECK: br label %for.tail
-; CHECK: for.tail:
-; CHECK: [[VAL_A_IDX:%.*]] = load i32, ptr
-; CHECK-NEXT: [[ZERO_CHECK:%.*]] = icmp eq i32 [[VAL_A_IDX]], 0
-; CHECK-NEXT: br i1 [[ZERO_CHECK]], label %for.cond1.deopt.exit_crit_edge, label %for.cond2
-; CHECK: for.cond2.return_crit_edge:
-; CHECK-NEXT: {{%.*}} = phi i32
-; CHECK-NEXT: br label %return
-; CHECK: return:
-; CHECK-NEXT: [[SUM_LCSSA2:%.*]] = phi i32
-; CHECK-NEXT: ret i32 [[SUM_LCSSA2]]
-; CHECK: for.cond1.deopt.exit_crit_edge:
-; CHECK-NEXT: {{%.*}} = phi i32
-; CHECK-NEXT: br label %deopt.exit
-; CHECK: deopt.exit:
-; CHECK: [[DEOPT_VAL:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 {{%.*}}) ]
-; CHECK-NEXT: ret i32 [[DEOPT_VAL]]
-;
-entry:
- br label %for.cond1
-
-for.cond1:
- %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.tail ]
- %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.tail ]
- %a.idx = getelementptr inbounds i32, ptr %a, i64 %idx
- %val.a.idx = load i32, ptr %a.idx, align 4
- %zero.check = icmp eq i32 %val.a.idx, 0
- br i1 %zero.check, label %deopt.exit, label %for.cond2
-
-for.cond2:
- %for.check = icmp ult i64 %idx, %x
- br i1 %for.check, label %for.body, label %return
-
-for.body:
- br label %for.tail
-
-for.tail:
- %sum.next = add i32 %sum, %val.a.idx
- %idx.next = add nuw nsw i64 %idx, 1
- br label %for.cond1
-
-return:
- ret i32 %sum
-
-deopt.exit:
- %deopt.val = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %val.a.idx) ]
- ret i32 %deopt.val
-}
-
-define i32 @test_cond_with_two_deopt_exits(ptr nonnull %a, i64 %x) {
-; Rotation done three times.
-; Latch should be at the 3rd condition (for.cond3), exiting to %return.
-;
-; CHECK-LABEL: @test_cond_with_two_deopt_exits(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_IDX_DEREF4:%.*]] = load ptr, ptr %a
-; CHECK-NEXT: [[NULL_CHECK5:%.*]] = icmp eq ptr [[A_IDX_DEREF4]], null
-; CHECK-NEXT: br i1 [[NULL_CHECK5]], label %deopt.exit1, label %for.cond2.lr.ph
-; CHECK: for.cond2.lr.ph:
-; CHECK-NEXT: [[VAL_A_IDX9:%.*]] = load i32, ptr [[A_IDX_DEREF4]], align 4
-; CHECK-NEXT: [[ZERO_CHECK10:%.*]] = icmp eq i32 [[VAL_A_IDX9]], 0
-; CHECK-NEXT: br i1 [[ZERO_CHECK10]], label %deopt.exit2, label %for.cond3.lr.ph
-; CHECK: for.cond3.lr.ph:
-; CHECK-NEXT: [[FOR_CHECK14:%.*]] = icmp ult i64 0, %x
-; CHECK-NEXT: br i1 [[FOR_CHECK14]], label %for.body.lr.ph, label %return
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: br label %for.body
-; CHECK: for.cond2:
-; CHECK: [[VAL_A_IDX:%.*]] = load i32, ptr
-; CHECK-NEXT: [[ZERO_CHECK:%.*]] = icmp eq i32 [[VAL_A_IDX]], 0
-; CHECK-NEXT: br i1 [[ZERO_CHECK]], label %for.cond2.deopt.exit2_crit_edge, label %for.cond3
-; CHECK: for.cond3:
-; CHECK: [[FOR_CHECK:%.*]] = icmp ult i64 {{%.*}}, %x
-; CHECK-NEXT: br i1 [[FOR_CHECK]], label %for.body, label %for.cond3.return_crit_edge
-; CHECK: for.body:
-; CHECK: br label %for.tail
-; CHECK: for.tail:
-; CHECK: [[IDX_NEXT:%.*]] = add nuw nsw i64 {{%.*}}, 1
-; CHECK: [[NULL_CHECK:%.*]] = icmp eq ptr {{%.*}}, null
-; CHECK-NEXT: br i1 [[NULL_CHECK]], label %for.cond1.deopt.exit1_crit_edge, label %for.cond2
-; CHECK: for.cond3.return_crit_edge:
-; CHECK-NEXT: [[SPLIT18:%.*]] = phi i32
-; CHECK-NEXT: br label %return
-; CHECK: return:
-; CHECK-NEXT: [[SUM_LCSSA2:%.*]] = phi i32
-; CHECK-NEXT: ret i32 [[SUM_LCSSA2]]
-; CHECK: for.cond1.deopt.exit1_crit_edge:
-; CHECK-NEXT: br label %deopt.exit1
-; CHECK: deopt.exit1:
-; CHECK-NEXT: [[DEOPT_VAL1:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 0) ]
-; CHECK-NEXT: ret i32 [[DEOPT_VAL1]]
-; CHECK: for.cond2.deopt.exit2_crit_edge:
-; CHECK-NEXT: [[SPLIT:%.*]] = phi i32
-; CHECK-NEXT: br label %deopt.exit2
-; CHECK: deopt.exit2:
-; CHECK-NEXT: [[VAL_A_IDX_LCSSA:%.*]] = phi i32
-; CHECK-NEXT: [[DEOPT_VAL2:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 [[VAL_A_IDX_LCSSA]]) ]
-; CHECK-NEXT: ret i32 [[DEOPT_VAL2]]
-;
-entry:
- br label %for.cond1
-
-for.cond1:
- %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.tail ]
- %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.tail ]
- %a.idx = getelementptr inbounds ptr, ptr %a, i64 %idx
- %a.idx.deref = load ptr, ptr %a.idx
- %null.check = icmp eq ptr %a.idx.deref, null
- br i1 %null.check, label %deopt.exit1, label %for.cond2
-
-for.cond2:
- %val.a.idx = load i32, ptr %a.idx.deref, align 4
- %zero.check = icmp eq i32 %val.a.idx, 0
- br i1 %zero.check, label %deopt.exit2, label %for.cond3
-
-for.cond3:
- %for.check = icmp ult i64 %idx, %x
- br i1 %for.check, label %for.body, label %return
-
-for.body:
- br label %for.tail
-
-for.tail:
- %sum.next = add i32 %sum, %val.a.idx
- %idx.next = add nuw nsw i64 %idx, 1
- br label %for.cond1
-
-return:
- ret i32 %sum
-
-deopt.exit1:
- %deopt.val1 = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 0) ]
- ret i32 %deopt.val1
-deopt.exit2:
- %deopt.val2 = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %val.a.idx) ]
- ret i32 %deopt.val2
-}
diff --git a/llvm/test/Transforms/LoopRotate/multiple-exits.ll b/llvm/test/Transforms/LoopRotate/multiple-exits.ll
deleted file mode 100644
index 748700c..0000000
--- a/llvm/test/Transforms/LoopRotate/multiple-exits.ll
+++ /dev/null
@@ -1,236 +0,0 @@
-; RUN: opt -S -passes=loop-rotate < %s -verify-loop-info -verify-dom-info -verify-memoryssa | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.8.0"
-
-; PR7447
-define i32 @test1(ptr nocapture %a) nounwind readonly {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.cond1, %entry
- %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ]
- %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ]
- br i1 %i.0, label %for.cond1, label %return
-
-for.cond1: ; preds = %for.cond, %land.rhs
- %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ]
- %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ]
- %cmp2 = icmp ult i32 %i.1, 100
- br i1 %cmp2, label %land.rhs, label %for.cond
-
-land.rhs: ; preds = %for.cond1
- %conv = zext i32 %i.1 to i64
- %arrayidx = getelementptr inbounds [100 x i32], ptr %a, i64 0, i64 %conv
- %0 = load i32, ptr %arrayidx, align 4
- %add = add i32 %0, %sum.1
- %cmp4 = icmp ugt i32 %add, 1000
- %inc = add i32 %i.1, 1
- br i1 %cmp4, label %return, label %for.cond1
-
-return: ; preds = %for.cond, %land.rhs
- %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
- ret i32 %retval.0
-
-; CHECK-LABEL: @test1(
-; CHECK: for.cond1.preheader:
-; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ]
-; CHECK: br label %for.cond1
-
-; CHECK: for.cond1:
-; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ]
-; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ]
-; CHECK: %cmp2 = icmp ult i32 %i.1, 100
-; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit
-}
-
-define void @test2(i32 %x) nounwind {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %if.end, %entry
- %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
- %cmp = icmp eq i32 %i.0, %x
- br i1 %cmp, label %return.loopexit, label %for.body
-
-for.body: ; preds = %for.cond
- %call = tail call i32 @foo(i32 %i.0) nounwind
- %tobool = icmp eq i32 %call, 0
- br i1 %tobool, label %if.end, label %a
-
-if.end: ; preds = %for.body
- %call1 = tail call i32 @foo(i32 42) nounwind
- %inc = add i32 %i.0, 1
- br label %for.cond
-
-a: ; preds = %for.body
- %call2 = tail call i32 @bar(i32 1) nounwind
- br label %return
-
-return.loopexit: ; preds = %for.cond
- br label %return
-
-return: ; preds = %return.loopexit, %a
- ret void
-
-; CHECK-LABEL: @test2(
-; CHECK: if.end:
-; CHECK: %inc = add i32 %i.02, 1
-; CHECK: %cmp = icmp eq i32 %inc, %x
-; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body
-}
-
-declare i32 @foo(i32)
-
-declare i32 @bar(i32)
-
-@_ZTIi = external constant ptr
-
-; Verify dominators.
-define void @test3(i32 %x) personality ptr @__gxx_personality_v0 {
-entry:
- %cmp2 = icmp eq i32 0, %x
- br i1 %cmp2, label %try.cont.loopexit, label %for.body.lr.ph
-
-for.body.lr.ph: ; preds = %entry
- br label %for.body
-
-for.body: ; preds = %for.body.lr.ph, %for.inc
- %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
- invoke void @_Z3fooi(i32 %i.03)
- to label %for.inc unwind label %lpad
-
-for.inc: ; preds = %for.body
- %inc = add i32 %i.03, 1
- %cmp = icmp eq i32 %inc, %x
- br i1 %cmp, label %for.cond.try.cont.loopexit_crit_edge, label %for.body
-
-lpad: ; preds = %for.body
- %0 = landingpad { ptr, i32 }
- catch ptr @_ZTIi
- %1 = extractvalue { ptr, i32 } %0, 0
- %2 = extractvalue { ptr, i32 } %0, 1
- %3 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTIi) nounwind
- %matches = icmp eq i32 %2, %3
- br i1 %matches, label %catch, label %eh.resume
-
-catch: ; preds = %lpad
- %4 = tail call ptr @__cxa_begin_catch(ptr %1) nounwind
- br i1 true, label %invoke.cont2.loopexit, label %for.body.i.lr.ph
-
-for.body.i.lr.ph: ; preds = %catch
- br label %for.body.i
-
-for.body.i: ; preds = %for.body.i.lr.ph, %for.inc.i
- %i.0.i1 = phi i32 [ 0, %for.body.i.lr.ph ], [ %inc.i, %for.inc.i ]
- invoke void @_Z3fooi(i32 %i.0.i1)
- to label %for.inc.i unwind label %lpad.i
-
-for.inc.i: ; preds = %for.body.i
- %inc.i = add i32 %i.0.i1, 1
- %cmp.i = icmp eq i32 %inc.i, 0
- br i1 %cmp.i, label %for.cond.i.invoke.cont2.loopexit_crit_edge, label %for.body.i
-
-lpad.i: ; preds = %for.body.i
- %5 = landingpad { ptr, i32 }
- catch ptr @_ZTIi
- %6 = extractvalue { ptr, i32 } %5, 0
- %7 = extractvalue { ptr, i32 } %5, 1
- %matches.i = icmp eq i32 %7, %3
- br i1 %matches.i, label %catch.i, label %lpad1.body
-
-catch.i: ; preds = %lpad.i
- %8 = tail call ptr @__cxa_begin_catch(ptr %6) nounwind
- invoke void @test3(i32 0)
- to label %invoke.cont2.i unwind label %lpad1.i
-
-invoke.cont2.i: ; preds = %catch.i
- tail call void @__cxa_end_catch() nounwind
- br label %invoke.cont2
-
-lpad1.i: ; preds = %catch.i
- %9 = landingpad { ptr, i32 }
- cleanup
- %10 = extractvalue { ptr, i32 } %9, 0
- %11 = extractvalue { ptr, i32 } %9, 1
- tail call void @__cxa_end_catch() nounwind
- br label %lpad1.body
-
-for.cond.i.invoke.cont2.loopexit_crit_edge: ; preds = %for.inc.i
- br label %invoke.cont2.loopexit
-
-invoke.cont2.loopexit: ; preds = %for.cond.i.invoke.cont2.loopexit_crit_edge, %catch
- br label %invoke.cont2
-
-invoke.cont2: ; preds = %invoke.cont2.loopexit, %invoke.cont2.i
- tail call void @__cxa_end_catch() nounwind
- br label %try.cont
-
-for.cond.try.cont.loopexit_crit_edge: ; preds = %for.inc
- br label %try.cont.loopexit
-
-try.cont.loopexit: ; preds = %for.cond.try.cont.loopexit_crit_edge, %entry
- br label %try.cont
-
-try.cont: ; preds = %try.cont.loopexit, %invoke.cont2
- ret void
-
-lpad1.body: ; preds = %lpad1.i, %lpad.i
- %exn.slot.0.i = phi ptr [ %10, %lpad1.i ], [ %6, %lpad.i ]
- %ehselector.slot.0.i = phi i32 [ %11, %lpad1.i ], [ %7, %lpad.i ]
- tail call void @__cxa_end_catch() nounwind
- br label %eh.resume
-
-eh.resume: ; preds = %lpad1.body, %lpad
- %exn.slot.0 = phi ptr [ %exn.slot.0.i, %lpad1.body ], [ %1, %lpad ]
- %ehselector.slot.0 = phi i32 [ %ehselector.slot.0.i, %lpad1.body ], [ %2, %lpad ]
- %lpad.val = insertvalue { ptr, i32 } undef, ptr %exn.slot.0, 0
- %lpad.val5 = insertvalue { ptr, i32 } %lpad.val, i32 %ehselector.slot.0, 1
- resume { ptr, i32 } %lpad.val5
-}
-
-declare void @_Z3fooi(i32)
-
-declare i32 @__gxx_personality_v0(...)
-
-declare i32 @llvm.eh.typeid.for(ptr) nounwind readnone
-
-declare ptr @__cxa_begin_catch(ptr)
-
-declare void @__cxa_end_catch()
-
-define void @test4(i1 %arg) nounwind uwtable {
-entry:
- br label %"7"
-
-"3": ; preds = %"7"
- br i1 %arg, label %"31", label %"4"
-
-"4": ; preds = %"3"
- %. = select i1 undef, float 0x3F50624DE0000000, float undef
- %0 = add i32 %1, 1
- br label %"7"
-
-"7": ; preds = %"4", %entry
- %1 = phi i32 [ %0, %"4" ], [ 0, %entry ]
- %2 = icmp slt i32 %1, 100
- br i1 %2, label %"3", label %"8"
-
-"8": ; preds = %"7"
- br i1 %arg, label %"9", label %"31"
-
-"9": ; preds = %"8"
- br label %"33"
-
-"27": ; preds = %"31"
- unreachable
-
-"31": ; preds = %"8", %"3"
- br i1 %arg, label %"27", label %"32"
-
-"32": ; preds = %"31"
- br label %"33"
-
-"33": ; preds = %"32", %"9"
- ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
index f5329cf..c225ede5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -580,6 +580,201 @@ exit:
ret double %accum
}
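+; The address of the second load comes through a phi blending a computed GEP
+; with the base pointer, so costing the replicated load must account for the
+; select of pointers that feeds it.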
+define void @loaded_address_used_by_load_through_blend(i64 %start, ptr noalias %src, ptr noalias %src.2, ptr noalias %dst) #0 {
+; I64-LABEL: define void @loaded_address_used_by_load_through_blend(
+; I64-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
+; I64-NEXT: [[ENTRY:.*]]:
+; I64-NEXT: br label %[[LOOP_HEADER:.*]]
+; I64: [[LOOP_HEADER]]:
+; I64-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; I64-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_LATCH]] ]
+; I64-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1
+; I64-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
+; I64-NEXT: [[L_SRC:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; I64-NEXT: [[C:%.*]] = fcmp oeq float [[L_SRC]], 0.000000e+00
+; I64-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; I64: [[THEN]]:
+; I64-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV_1]], [[START]]
+; I64-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[IV_MUL]]
+; I64-NEXT: br label %[[LOOP_LATCH]]
+; I64: [[LOOP_LATCH]]:
+; I64-NEXT: [[MERGE_GEP:%.*]] = phi ptr [ [[GEP_SRC_2]], %[[THEN]] ], [ [[SRC_2]], %[[LOOP_HEADER]] ]
+; I64-NEXT: [[L_2:%.*]] = load float, ptr [[MERGE_GEP]], align 4
+; I64-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
+; I64-NEXT: store float [[L_2]], ptr [[GEP_DST]], align 4
+; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; I64-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1
+; I64-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 100
+; I64-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
+; I64: [[EXIT]]:
+; I64-NEXT: ret void
+;
+; I32-LABEL: define void @loaded_address_used_by_load_through_blend(
+; I32-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
+; I32-NEXT: [[ENTRY:.*:]]
+; I32-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1
+; I32-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 100)
+; I32-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]]
+; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
+; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; I32: [[VECTOR_PH]]:
+; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
+; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; I32-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
+; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[START]], i64 0
+; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; I32-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x ptr> poison, ptr [[SRC_2]], i64 0
+; I32-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT1]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; I32-NEXT: br label %[[VECTOR_BODY:.*]]
+; I32: [[VECTOR_BODY]]:
+; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; I32-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
+; I32-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
+; I32-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3
+; I32-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 4
+; I32-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 5
+; I32-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 6
+; I32-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 7
+; I32-NEXT: [[TMP11:%.*]] = add i64 [[TMP3]], 1
+; I32-NEXT: [[TMP12:%.*]] = add i64 [[TMP4]], 1
+; I32-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], 1
+; I32-NEXT: [[TMP14:%.*]] = add i64 [[TMP6]], 1
+; I32-NEXT: [[TMP15:%.*]] = add i64 [[TMP7]], 1
+; I32-NEXT: [[TMP16:%.*]] = add i64 [[TMP8]], 1
+; I32-NEXT: [[TMP17:%.*]] = add i64 [[TMP9]], 1
+; I32-NEXT: [[TMP18:%.*]] = add i64 [[TMP10]], 1
+; I32-NEXT: [[TMP19:%.*]] = insertelement <8 x i64> poison, i64 [[TMP11]], i32 0
+; I32-NEXT: [[TMP20:%.*]] = insertelement <8 x i64> [[TMP19]], i64 [[TMP12]], i32 1
+; I32-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> [[TMP20]], i64 [[TMP13]], i32 2
+; I32-NEXT: [[TMP22:%.*]] = insertelement <8 x i64> [[TMP21]], i64 [[TMP14]], i32 3
+; I32-NEXT: [[TMP23:%.*]] = insertelement <8 x i64> [[TMP22]], i64 [[TMP15]], i32 4
+; I32-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> [[TMP23]], i64 [[TMP16]], i32 5
+; I32-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 6
+; I32-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 7
+; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]]
+; I32-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
+; I32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; I32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
+; I32-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]]
+; I32-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]]
+; I32-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
+; I32-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
+; I32-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP27]], align 4
+; I32-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP28]], align 4
+; I32-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP29]], align 4
+; I32-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP30]], align 4
+; I32-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP31]], align 4
+; I32-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP32]], align 4
+; I32-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP33]], align 4
+; I32-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP34]], align 4
+; I32-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP35]], i32 0
+; I32-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP36]], i32 1
+; I32-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP37]], i32 2
+; I32-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP38]], i32 3
+; I32-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP39]], i32 4
+; I32-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP40]], i32 5
+; I32-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP41]], i32 6
+; I32-NEXT: [[TMP50:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP42]], i32 7
+; I32-NEXT: [[TMP51:%.*]] = fcmp oeq <8 x float> [[TMP50]], zeroinitializer
+; I32-NEXT: [[TMP52:%.*]] = mul <8 x i64> [[TMP26]], [[BROADCAST_SPLAT]]
+; I32-NEXT: [[TMP53:%.*]] = extractelement <8 x i64> [[TMP52]], i32 0
+; I32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP53]]
+; I32-NEXT: [[TMP55:%.*]] = extractelement <8 x i64> [[TMP52]], i32 1
+; I32-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP55]]
+; I32-NEXT: [[TMP57:%.*]] = extractelement <8 x i64> [[TMP52]], i32 2
+; I32-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP57]]
+; I32-NEXT: [[TMP59:%.*]] = extractelement <8 x i64> [[TMP52]], i32 3
+; I32-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP59]]
+; I32-NEXT: [[TMP61:%.*]] = extractelement <8 x i64> [[TMP52]], i32 4
+; I32-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP61]]
+; I32-NEXT: [[TMP63:%.*]] = extractelement <8 x i64> [[TMP52]], i32 5
+; I32-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP63]]
+; I32-NEXT: [[TMP65:%.*]] = extractelement <8 x i64> [[TMP52]], i32 6
+; I32-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP65]]
+; I32-NEXT: [[TMP67:%.*]] = extractelement <8 x i64> [[TMP52]], i32 7
+; I32-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP67]]
+; I32-NEXT: [[TMP69:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP54]], i32 0
+; I32-NEXT: [[TMP70:%.*]] = insertelement <8 x ptr> [[TMP69]], ptr [[TMP56]], i32 1
+; I32-NEXT: [[TMP71:%.*]] = insertelement <8 x ptr> [[TMP70]], ptr [[TMP58]], i32 2
+; I32-NEXT: [[TMP72:%.*]] = insertelement <8 x ptr> [[TMP71]], ptr [[TMP60]], i32 3
+; I32-NEXT: [[TMP73:%.*]] = insertelement <8 x ptr> [[TMP72]], ptr [[TMP62]], i32 4
+; I32-NEXT: [[TMP74:%.*]] = insertelement <8 x ptr> [[TMP73]], ptr [[TMP64]], i32 5
+; I32-NEXT: [[TMP75:%.*]] = insertelement <8 x ptr> [[TMP74]], ptr [[TMP66]], i32 6
+; I32-NEXT: [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7
+; I32-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]]
+; I32-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0
+; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4
+; I32-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1
+; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4
+; I32-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2
+; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4
+; I32-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3
+; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4
+; I32-NEXT: [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4
+; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4
+; I32-NEXT: [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5
+; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4
+; I32-NEXT: [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6
+; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4
+; I32-NEXT: [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7
+; I32-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4
+; I32-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; I32-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
+; I32-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
+; I32-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; I32-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
+; I32-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; I32-NEXT: [[TMP99:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
+; I32-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
+; I32-NEXT: store float [[TMP78]], ptr [[TMP93]], align 4
+; I32-NEXT: store float [[TMP80]], ptr [[TMP94]], align 4
+; I32-NEXT: store float [[TMP82]], ptr [[TMP95]], align 4
+; I32-NEXT: store float [[TMP84]], ptr [[TMP96]], align 4
+; I32-NEXT: store float [[TMP86]], ptr [[TMP97]], align 4
+; I32-NEXT: store float [[TMP88]], ptr [[TMP98]], align 4
+; I32-NEXT: store float [[TMP90]], ptr [[TMP99]], align 4
+; I32-NEXT: store float [[TMP92]], ptr [[TMP100]], align 4
+; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; I32-NEXT: [[TMP101:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; I32-NEXT: br i1 [[TMP101]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; I32: [[MIDDLE_BLOCK]]:
+; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; I32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; I32: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %iv.2 = phi i64 [ %start, %entry ], [ %iv.2.next, %loop.latch ]
+ %iv.1 = add i64 %iv, 1
+ %gep.src = getelementptr i8, ptr %src, i64 %iv.1
+ %l.src = load float, ptr %gep.src, align 4
+ %c = fcmp oeq float %l.src, 0.000000e+00
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ %iv.mul = mul i64 %iv.1, %start
+ %gep.src.2 = getelementptr i8, ptr %src.2, i64 %iv.mul
+ br label %loop.latch
+
+loop.latch:
+ %merge.gep = phi ptr [ %gep.src.2, %then ], [ %src.2, %loop.header ]
+ %l.2 = load float, ptr %merge.gep, align 4
+ %gep.dst = getelementptr i8, ptr %dst, i64 %iv
+ store float %l.2, ptr %gep.dst, align 4
+ %iv.next = add i64 %iv, 1
+ %iv.2.next = add i64 %iv.2, -1
+ %ec = icmp sgt i64 %iv.2, 100
+ br i1 %ec, label %loop.header, label %exit
+
+exit:
+ ret void
+}
+
attributes #0 = { "target-cpu"="znver2" }
!0 = distinct !{!0, !1}
diff --git a/llvm/test/Transforms/NewGVN/ptrtoaddr.ll b/llvm/test/Transforms/NewGVN/ptrtoaddr.ll
new file mode 100644
index 0000000..e51b42a
--- /dev/null
+++ b/llvm/test/Transforms/NewGVN/ptrtoaddr.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=newgvn < %s | FileCheck %s
+
+define i64 @ptrtoaddr_same(ptr %p) {
+; CHECK-LABEL: define i64 @ptrtoaddr_same(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: ret i64 0
+;
+ %i = ptrtoaddr ptr %p to i64
+ %j = ptrtoaddr ptr %p to i64
+ %sub = sub i64 %i, %j
+ ret i64 %sub
+}
+
+; Note that unlike for ptrtoint, it's not possible for ptrtoaddr to differ
+; in result type for the same input.
+define i64 @ptrtoaddr_different(ptr %p, ptr %p2) {
+; CHECK-LABEL: define i64 @ptrtoaddr_different(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[P2:%.*]]) {
+; CHECK-NEXT: [[I:%.*]] = ptrtoaddr ptr [[P]] to i64
+; CHECK-NEXT: [[J:%.*]] = ptrtoaddr ptr [[P2]] to i64
+; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[I]], [[J]]
+; CHECK-NEXT: ret i64 [[SUB]]
+;
+ %i = ptrtoaddr ptr %p to i64
+ %j = ptrtoaddr ptr %p2 to i64
+ %sub = sub i64 %i, %j
+ ret i64 %sub
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/switch-to-arithmetic-inlining.ll b/llvm/test/Transforms/PhaseOrdering/switch-to-arithmetic-inlining.ll
index caf7a80..7c9888f 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch-to-arithmetic-inlining.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch-to-arithmetic-inlining.ll
@@ -436,10 +436,11 @@ bb104: ; preds = %bb102
br label %bb105
}
+; Make sure the call is inlined.
define i8 @test2(i8 %x) {
; CHECK-LABEL: define range(i8 0, 53) i8 @test2(
; CHECK-SAME: i8 [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT: [[CALL:%.*]] = tail call i8 @test(i8 [[X]])
+; CHECK-NEXT: [[CALL:%.*]] = tail call range(i8 0, 53) i8 @llvm.umin.i8(i8 [[X]], i8 52)
; CHECK-NEXT: ret i8 [[CALL]]
;
%call = call i8 @test(i8 %x)
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll b/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll
index c9063d3..25267dc 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes='simplifycfg' < %s | FileCheck %s --check-prefix=OPTNOLUT
+; RUN: opt -S -passes='simplifycfg<switch-to-arithmetic>' < %s | FileCheck %s --check-prefix=OPTNOLUT
; RUN: %if amdgpu-registered-target %{ opt -mtriple=amdgcn--amdpal -S -passes='simplifycfg<switch-to-lookup>' < %s | FileCheck %s --check-prefix=TTINOLUT %}
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -7,23 +7,11 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i32 @linear_transform_with_default(i32 %x) {
; OPTNOLUT-LABEL: define i32 @linear_transform_with_default(
; OPTNOLUT-SAME: i32 [[X:%.*]]) {
-; OPTNOLUT-NEXT: [[ENTRY:.*]]:
-; OPTNOLUT-NEXT: switch i32 [[X]], label %[[END:.*]] [
-; OPTNOLUT-NEXT: i32 0, label %[[CASE0:.*]]
-; OPTNOLUT-NEXT: i32 1, label %[[CASE1:.*]]
-; OPTNOLUT-NEXT: i32 2, label %[[CASE2:.*]]
-; OPTNOLUT-NEXT: i32 3, label %[[CASE3:.*]]
-; OPTNOLUT-NEXT: ]
-; OPTNOLUT: [[CASE0]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[CASE1]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[CASE2]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[CASE3]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[END]]:
-; OPTNOLUT-NEXT: [[IDX:%.*]] = phi i32 [ 1, %[[CASE0]] ], [ 4, %[[CASE1]] ], [ 7, %[[CASE2]] ], [ 10, %[[CASE3]] ], [ 13, %[[ENTRY]] ]
+; OPTNOLUT-NEXT: [[ENTRY:.*:]]
+; OPTNOLUT-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X]], 4
+; OPTNOLUT-NEXT: [[SWITCH_IDX_MULT:%.*]] = mul nsw i32 [[X]], 3
+; OPTNOLUT-NEXT: [[SWITCH_OFFSET:%.*]] = add nsw i32 [[SWITCH_IDX_MULT]], 1
+; OPTNOLUT-NEXT: [[IDX:%.*]] = select i1 [[TMP0]], i32 [[SWITCH_OFFSET]], i32 13
; OPTNOLUT-NEXT: ret i32 [[IDX]]
;
; TTINOLUT-LABEL: define i32 @linear_transform_with_default(
@@ -138,26 +126,8 @@ end:
define i32 @linear_transform_no_default(i32 %x) {
; OPTNOLUT-LABEL: define i32 @linear_transform_no_default(
; OPTNOLUT-SAME: i32 [[X:%.*]]) {
-; OPTNOLUT-NEXT: [[ENTRY:.*]]:
-; OPTNOLUT-NEXT: switch i32 [[X]], label %[[DEFAULT:.*]] [
-; OPTNOLUT-NEXT: i32 0, label %[[END:.*]]
-; OPTNOLUT-NEXT: i32 1, label %[[CASE1:.*]]
-; OPTNOLUT-NEXT: i32 2, label %[[CASE2:.*]]
-; OPTNOLUT-NEXT: i32 3, label %[[CASE3:.*]]
-; OPTNOLUT-NEXT: i32 4, label %[[CASE4:.*]]
-; OPTNOLUT-NEXT: ]
-; OPTNOLUT: [[CASE1]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[CASE2]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[CASE3]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[CASE4]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[DEFAULT]]:
-; OPTNOLUT-NEXT: unreachable
-; OPTNOLUT: [[END]]:
-; OPTNOLUT-NEXT: [[SWITCH_IDX_MULT:%.*]] = phi i32 [ 3, %[[CASE1]] ], [ 6, %[[CASE2]] ], [ 9, %[[CASE3]] ], [ 12, %[[CASE4]] ], [ 0, %[[ENTRY]] ]
+; OPTNOLUT-NEXT: [[ENTRY:.*:]]
+; OPTNOLUT-NEXT: [[SWITCH_IDX_MULT:%.*]] = mul nsw i32 [[X]], 3
; OPTNOLUT-NEXT: ret i32 [[SWITCH_IDX_MULT]]
;
; TTINOLUT-LABEL: define i32 @linear_transform_no_default(
@@ -350,18 +320,9 @@ end:
define i32 @single_value_withdefault(i32 %x) {
; OPTNOLUT-LABEL: define i32 @single_value_withdefault(
; OPTNOLUT-SAME: i32 [[X:%.*]]) {
-; OPTNOLUT-NEXT: [[ENTRY:.*]]:
-; OPTNOLUT-NEXT: switch i32 [[X]], label %[[DEFAULT:.*]] [
-; OPTNOLUT-NEXT: i32 0, label %[[END:.*]]
-; OPTNOLUT-NEXT: i32 1, label %[[END]]
-; OPTNOLUT-NEXT: i32 2, label %[[END]]
-; OPTNOLUT-NEXT: i32 3, label %[[END]]
-; OPTNOLUT-NEXT: i32 4, label %[[END]]
-; OPTNOLUT-NEXT: ]
-; OPTNOLUT: [[DEFAULT]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[END]]:
-; OPTNOLUT-NEXT: [[DOT:%.*]] = phi i32 [ 3, %[[DEFAULT]] ], [ 2, %[[ENTRY]] ], [ 2, %[[ENTRY]] ], [ 2, %[[ENTRY]] ], [ 2, %[[ENTRY]] ], [ 2, %[[ENTRY]] ]
+; OPTNOLUT-NEXT: [[ENTRY:.*:]]
+; OPTNOLUT-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X]], 5
+; OPTNOLUT-NEXT: [[DOT:%.*]] = select i1 [[TMP0]], i32 2, i32 3
; OPTNOLUT-NEXT: ret i32 [[DOT]]
;
; TTINOLUT-LABEL: define i32 @single_value_withdefault(
@@ -401,18 +362,9 @@ end:
define i32 @single_value_no_jump_tables(i32 %x) "no-jump-tables"="true" {
; OPTNOLUT-LABEL: define i32 @single_value_no_jump_tables(
; OPTNOLUT-SAME: i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; OPTNOLUT-NEXT: [[ENTRY:.*]]:
-; OPTNOLUT-NEXT: switch i32 [[X]], label %[[DEFAULT:.*]] [
-; OPTNOLUT-NEXT: i32 0, label %[[END:.*]]
-; OPTNOLUT-NEXT: i32 1, label %[[END]]
-; OPTNOLUT-NEXT: i32 2, label %[[END]]
-; OPTNOLUT-NEXT: i32 3, label %[[END]]
-; OPTNOLUT-NEXT: i32 4, label %[[END]]
-; OPTNOLUT-NEXT: ]
-; OPTNOLUT: [[DEFAULT]]:
-; OPTNOLUT-NEXT: br label %[[END]]
-; OPTNOLUT: [[END]]:
-; OPTNOLUT-NEXT: [[IDX:%.*]] = phi i32 [ 3, %[[DEFAULT]] ], [ 2, %[[ENTRY]] ], [ 2, %[[ENTRY]] ], [ 2, %[[ENTRY]] ], [ 2, %[[ENTRY]] ], [ 2, %[[ENTRY]] ]
+; OPTNOLUT-NEXT: [[ENTRY:.*:]]
+; OPTNOLUT-NEXT: [[TMP0:%.*]] = icmp ult i32 [[X]], 5
+; OPTNOLUT-NEXT: [[IDX:%.*]] = select i1 [[TMP0]], i32 2, i32 3
; OPTNOLUT-NEXT: ret i32 [[IDX]]
;
; TTINOLUT-LABEL: define i32 @single_value_no_jump_tables(
@@ -449,6 +401,60 @@ end:
ret i32 %idx
}
+define i1 @single_value_with_mask(i32 %x) {
+; OPTNOLUT-LABEL: define i1 @single_value_with_mask(
+; OPTNOLUT-SAME: i32 [[X:%.*]]) {
+; OPTNOLUT-NEXT: [[ENTRY:.*]]:
+; OPTNOLUT-NEXT: switch i32 [[X]], label %[[DEFAULT:.*]] [
+; OPTNOLUT-NEXT: i32 18, label %[[END:.*]]
+; OPTNOLUT-NEXT: i32 21, label %[[END]]
+; OPTNOLUT-NEXT: i32 48, label %[[END]]
+; OPTNOLUT-NEXT: i32 16, label %[[END]]
+; OPTNOLUT-NEXT: ]
+; OPTNOLUT: [[DEFAULT]]:
+; OPTNOLUT-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 80
+; OPTNOLUT-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i1 false, i1 true
+; OPTNOLUT-NEXT: br label %[[END]]
+; OPTNOLUT: [[END]]:
+; OPTNOLUT-NEXT: [[RES:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ false, %[[ENTRY]] ], [ [[SEL]], %[[DEFAULT]] ]
+; OPTNOLUT-NEXT: ret i1 [[RES]]
+;
+; TTINOLUT-LABEL: define i1 @single_value_with_mask(
+; TTINOLUT-SAME: i32 [[X:%.*]]) {
+; TTINOLUT-NEXT: [[ENTRY:.*]]:
+; TTINOLUT-NEXT: [[SWITCH_TABLEIDX:%.*]] = sub i32 [[X]], 16
+; TTINOLUT-NEXT: [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 33
+; TTINOLUT-NEXT: [[SWITCH_MASKINDEX:%.*]] = zext i32 [[SWITCH_TABLEIDX]] to i64
+; TTINOLUT-NEXT: [[SWITCH_SHIFTED:%.*]] = lshr i64 4294967333, [[SWITCH_MASKINDEX]]
+; TTINOLUT-NEXT: [[SWITCH_LOBIT:%.*]] = trunc i64 [[SWITCH_SHIFTED]] to i1
+; TTINOLUT-NEXT: [[OR_COND:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_LOBIT]], i1 false
+; TTINOLUT-NEXT: br i1 [[OR_COND]], label %[[END:.*]], label %[[DEFAULT:.*]]
+; TTINOLUT: [[DEFAULT]]:
+; TTINOLUT-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 80
+; TTINOLUT-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i1 false, i1 true
+; TTINOLUT-NEXT: br label %[[END]]
+; TTINOLUT: [[END]]:
+; TTINOLUT-NEXT: [[RES:%.*]] = phi i1 [ [[SEL]], %[[DEFAULT]] ], [ false, %[[ENTRY]] ]
+; TTINOLUT-NEXT: ret i1 [[RES]]
+;
+entry:
+ switch i32 %x, label %default [
+ i32 18, label %end
+ i32 21, label %end
+ i32 48, label %end
+ i32 16, label %end
+ ]
+
+default:
+ %cmp = icmp eq i32 %x, 80
+ %sel = select i1 %cmp, i1 false, i1 true
+ br label %end
+
+end:
+ %res = phi i1 [ false, %entry ], [ false, %entry ], [ false, %entry ], [ false, %entry ], [ %sel, %default ]
+ ret i1 %res
+}
+
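
In the TTINOLUT output above, the case values {16, 18, 21, 48} are rebased by -16 and folded into a 33-bit membership mask; 4294967333 (0x100000025) is exactly that bitset. A small sketch verifying the constants from the CHECK lines (illustrative, not part of the test):

```c++
#include <cassert>
#include <cstdint>

int main() {
  // Case values {16, 18, 21, 48} rebased by -16 give bit positions
  // {0, 2, 5, 32}; membership becomes one shift plus a truncation.
  uint64_t mask = (1ULL << (16 - 16)) | (1ULL << (18 - 16)) |
                  (1ULL << (21 - 16)) | (1ULL << (48 - 16));
  assert(mask == 4294967333ULL); // the constant in the lshr above
  auto inSwitch = [&](uint32_t x) {
    uint32_t idx = x - 16;                  // sub
    return idx < 33 && ((mask >> idx) & 1); // icmp ult, lshr, trunc
  };
  assert(inSwitch(18) && inSwitch(48) && !inSwitch(17) && !inSwitch(80));
  return 0;
}
```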
define i32 @lookup_table(i32 %x) {
; OPTNOLUT-LABEL: define i32 @lookup_table(
; OPTNOLUT-SAME: i32 [[X:%.*]]) {
diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
index be1b6b7..5715058 100644
--- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp
+++ b/llvm/tools/llvm-gpu-loader/amdhsa.cpp
@@ -192,7 +192,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
// Initialize all the arguments (explicit and implicit) to zero, then set the
// explicit arguments to the values created above.
std::memset(args, 0, args_size);
- std::memcpy(args, &kernel_args, sizeof(args_t));
+ std::memcpy(args, &kernel_args, std::is_empty_v<args_t> ? 0 : sizeof(args_t));
// Initialize the necessary implicit arguments to the proper values.
int dims = 1 + (params.num_blocks_y * params.num_threads_y != 1) +
@@ -563,7 +563,7 @@ int load_amdhsa(int argc, const char **argv, const char **envp, void *image,
// Save the return value and perform basic clean-up.
int ret = *static_cast<int *>(host_ret);
- end_args_t fini_args = {ret};
+ end_args_t fini_args = {};
if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
server, single_threaded_params, "_end.kd", fini_args,
diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
index ed34d0b..08861c2 100644
--- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
+++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h
@@ -41,9 +41,7 @@ struct start_args_t {
};
/// The arguments to the '_end' kernel.
-struct end_args_t {
- int argc;
-};
+struct end_args_t {};
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
diff --git a/llvm/tools/llvm-gpu-loader/nvptx.cpp b/llvm/tools/llvm-gpu-loader/nvptx.cpp
index 781a045..82b4552 100644
--- a/llvm/tools/llvm-gpu-loader/nvptx.cpp
+++ b/llvm/tools/llvm-gpu-loader/nvptx.cpp
@@ -177,7 +177,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
handle_error(err);
// Set up the arguments to the '_start' kernel on the GPU.
- uint64_t args_size = sizeof(args_t);
+ uint64_t args_size = std::is_empty_v<args_t> ? 0 : sizeof(args_t);
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END};
@@ -342,7 +342,7 @@ int load_nvptx(int argc, const char **argv, const char **envp, void *image,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
- end_args_t fini_args = {host_ret};
+ end_args_t fini_args = {};
if (CUresult err =
launch_kernel(binary, stream, server, single_threaded_params, "_end",
fini_args, print_resource_usage))
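
These loader changes all work around the same C++ rule: an empty struct has sizeof 1, never 0, so copying `sizeof(args_t)` bytes from the now-empty `end_args_t` would smuggle one garbage byte into the kernel argument buffer and report a nonzero argument size. A minimal sketch of the guard (hand-written, not the loader's actual helper):

```c++
#include <cstddef>
#include <cstring>
#include <type_traits>

struct end_args_t {}; // empty, yet sizeof(end_args_t) == 1

// Copy kernel arguments, treating empty argument structs as zero-sized.
template <typename args_t>
void copy_args(void *dst, const args_t &args) {
  static_assert(std::is_trivially_copyable_v<args_t>);
  std::size_t size = std::is_empty_v<args_t> ? 0 : sizeof(args_t);
  std::memcpy(dst, &args, size); // memcpy with size 0 is well-defined
}
```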
diff --git a/llvm/unittests/BinaryFormat/DwarfTest.cpp b/llvm/unittests/BinaryFormat/DwarfTest.cpp
index 684e59f..f4519f6 100644
--- a/llvm/unittests/BinaryFormat/DwarfTest.cpp
+++ b/llvm/unittests/BinaryFormat/DwarfTest.cpp
@@ -219,4 +219,77 @@ TEST(DwarfTest, lname) {
EXPECT_EQ(roundtrip(DW_LANG_##NAME), DW_LANG_##NAME);
#include "llvm/BinaryFormat/Dwarf.def"
}
+
+TEST(DwarfTest, lname_getSourceLanguageName) {
+ // Some basics.
+ EXPECT_EQ(getSourceLanguageName("DW_LNAME_Ada"), DW_LNAME_Ada);
+ EXPECT_EQ(getSourceLanguageName("DW_LNAME_Metal"), DW_LNAME_Metal);
+
+ // Test invalid input.
+ EXPECT_EQ(getSourceLanguageName(""), 0U);
+ EXPECT_EQ(getSourceLanguageName("blah"), 0U);
+ EXPECT_EQ(getSourceLanguageName("DW_LNAME__something_unlikely"), 0U);
+ EXPECT_EQ(getSourceLanguageName("DW_LANG_C"), 0U);
+
+ // Test that we cover all DW_LNAME_ names.
+#define xstr(X) #X
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND) \
+ EXPECT_EQ(getSourceLanguageName(xstr(DW_LNAME_##NAME)), DW_LNAME_##NAME);
+#include "llvm/BinaryFormat/Dwarf.def"
+}
+
+TEST(DwarfTest, lname_SourceLanguageNameString) {
+ // Some basics.
+ EXPECT_EQ(SourceLanguageNameString(DW_LNAME_C_plus_plus),
+ "DW_LNAME_C_plus_plus");
+ EXPECT_EQ(SourceLanguageNameString(DW_LNAME_CPP_for_OpenCL),
+ "DW_LNAME_CPP_for_OpenCL");
+
+ // Test invalid input.
+ EXPECT_EQ(SourceLanguageNameString(static_cast<SourceLanguageName>(0)), "");
+
+ // Test that we cover all DW_LNAME_ names.
+#define xstr(X) #X
+#define HANDLE_DW_LNAME(ID, NAME, DESC, LOWER_BOUND) \
+ EXPECT_EQ(SourceLanguageNameString(DW_LNAME_##NAME), xstr(DW_LNAME_##NAME));
+#include "llvm/BinaryFormat/Dwarf.def"
+}
+
+TEST(DWARFDebugInfo, TestLanguageDescription_Versioned) {
+ // Tests for the llvm::dwarf::LanguageDescription API that
+ // takes a name *and* a version.
+
+ // Unknown language.
+ EXPECT_EQ(
+ llvm::dwarf::LanguageDescription(static_cast<SourceLanguageName>(0)),
+ "Unknown");
+
+ EXPECT_EQ(
+ llvm::dwarf::LanguageDescription(static_cast<SourceLanguageName>(0), 0),
+ "Unknown");
+
+ // Test that specifying an invalid version falls back to a valid language name
+ // regardless.
+ EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_ObjC, 0), "Objective C");
+ EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_Julia, 0), "Julia");
+
+ // Check some versions.
+ EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C_plus_plus, 199711),
+ "C++98");
+ EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C_plus_plus, 201402),
+ "C++14");
+
+ // Versions round up.
+ EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C_plus_plus, 201400),
+ "C++14");
+
+ // Version 0 for C and C++ is an unversioned name.
+ EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C, 0), "C (K&R and ISO)");
+ EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_C_plus_plus, 0),
+ "ISO C++");
+
+ // Version 0 for other versioned languages may not be the unversioned name.
+ EXPECT_EQ(llvm::dwarf::LanguageDescription(DW_LNAME_Fortran, 0),
+ "FORTRAN 77");
+}
} // end namespace
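
The coverage tests above lean on the classic .def X-macro pattern: Dwarf.def expands `HANDLE_DW_LNAME` once per language, and the stringizing macro `xstr` turns each pasted enumerator into its own spelling, so every entry is round-tripped automatically. A self-contained sketch of the pattern with a made-up language list (purely illustrative):

```c++
#include <cassert>
#include <cstring>

// Hypothetical stand-in for a .def file, inlined as a list macro.
#define HANDLE_LANG_LIST                                                       \
  HANDLE_LANG(1, Ada)                                                          \
  HANDLE_LANG(2, Metal)

enum Lang {
#define HANDLE_LANG(ID, NAME) Lang_##NAME = ID,
  HANDLE_LANG_LIST
#undef HANDLE_LANG
};

// Token-paste first, then stringize: xstr(Lang_Ada) yields "Lang_Ada".
#define xstr(X) #X

const char *langName(Lang L) {
  switch (L) {
#define HANDLE_LANG(ID, NAME)                                                  \
  case Lang_##NAME:                                                            \
    return xstr(Lang_##NAME);
    HANDLE_LANG_LIST
#undef HANDLE_LANG
  }
  return "";
}

int main() {
  assert(std::strcmp(langName(Lang_Metal), "Lang_Metal") == 0);
  return 0;
}
```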
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index 3a625b2..ce2a38b 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -100,8 +100,8 @@ public:
// scope.
DIBuilder DIB(*Mod);
OurFile = DIB.createFile("xyzzy.c", "/cave");
- OurCU =
- DIB.createCompileUnit(dwarf::DW_LANG_C99, OurFile, "nou", false, "", 0);
+ OurCU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C99),
+ OurFile, "nou", false, "", 0);
auto OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
OurFunc =
DIB.createFunction(OurCU, "bees", "", OurFile, 1, OurSubT, 1,
diff --git a/llvm/unittests/CodeGen/LexicalScopesTest.cpp b/llvm/unittests/CodeGen/LexicalScopesTest.cpp
index 34bd37a..0c6b932 100644
--- a/llvm/unittests/CodeGen/LexicalScopesTest.cpp
+++ b/llvm/unittests/CodeGen/LexicalScopesTest.cpp
@@ -102,8 +102,8 @@ public:
// scope.
DIBuilder DIB(Mod);
OurFile = DIB.createFile("xyzzy.c", "/cave");
- OurCU =
- DIB.createCompileUnit(dwarf::DW_LANG_C99, OurFile, "nou", false, "", 0);
+ OurCU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C99),
+ OurFile, "nou", false, "", 0);
OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
OurFunc =
DIB.createFunction(OurCU, "bees", "", OurFile, 1, OurSubT, 1,
diff --git a/llvm/unittests/CodeGen/MachineBasicBlockTest.cpp b/llvm/unittests/CodeGen/MachineBasicBlockTest.cpp
index bcb5a18..ef0d40b 100644
--- a/llvm/unittests/CodeGen/MachineBasicBlockTest.cpp
+++ b/llvm/unittests/CodeGen/MachineBasicBlockTest.cpp
@@ -40,8 +40,8 @@ TEST(FindDebugLocTest, DifferentIterators) {
// scope.
DIBuilder DIB(Mod);
DIFile *OurFile = DIB.createFile("foo.c", "/bar");
- DICompileUnit *OurCU =
- DIB.createCompileUnit(dwarf::DW_LANG_C99, OurFile, "", false, "", 0);
+ DICompileUnit *OurCU = DIB.createCompileUnit(
+ DISourceLanguageName(dwarf::DW_LANG_C99), OurFile, "", false, "", 0);
auto OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
DISubprogram *OurFunc =
DIB.createFunction(OurCU, "bees", "", OurFile, 1, OurSubT, 1,
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index c13570d..e568723 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -11,6 +11,7 @@
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
@@ -212,8 +213,8 @@ protected:
DIBuilder DIB(*M);
auto File = DIB.createFile("test.dbg", "/src", std::nullopt,
std::optional<StringRef>("/src/test.dbg"));
- auto CU =
- DIB.createCompileUnit(dwarf::DW_LANG_C, File, "llvm-C", true, "", 0);
+ auto CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C),
+ File, "llvm-C", true, "", 0);
auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
auto SP = DIB.createFunction(
CU, "foo", "", File, 1, Type, 1, DINode::FlagZero,
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index 475e0a9..060f45d 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -409,7 +409,8 @@ TEST(DIBuilder, CreateFortranArrayTypeWithAttributes) {
DIFile *F = DIB.createFile("main.c", "/");
DICompileUnit *CU = DIB.createCompileUnit(
- dwarf::DW_LANG_C, DIB.createFile("main.c", "/"), "llvm-c", true, "", 0);
+ DISourceLanguageName(dwarf::DW_LANG_C), DIB.createFile("main.c", "/"),
+ "llvm-c", true, "", 0);
DIVariable *DataLocation =
DIB.createTempGlobalVariableFwdDecl(CU, "dl", "_dl", F, 1, nullptr, true);
@@ -1335,8 +1336,8 @@ TEST(DIBuilder, HashingDISubprogram) {
DIBuilder DIB(*M);
DIFile *F = DIB.createFile("main.c", "/");
- DICompileUnit *CU =
- DIB.createCompileUnit(dwarf::DW_LANG_C, F, "Test", false, "", 0);
+ DICompileUnit *CU = DIB.createCompileUnit(
+ DISourceLanguageName(dwarf::DW_LANG_C), F, "Test", false, "", 0);
llvm::TempDIType ForwardDeclaredType =
llvm::TempDIType(DIB.createReplaceableCompositeType(
@@ -1381,8 +1382,8 @@ TEST(DIBuilder, CompositeTypes) {
DIBuilder DIB(*M);
DIFile *F = DIB.createFile("main.c", "/");
- DICompileUnit *CU =
- DIB.createCompileUnit(dwarf::DW_LANG_C, F, "Test", false, "", 0);
+ DICompileUnit *CU = DIB.createCompileUnit(
+ DISourceLanguageName(dwarf::DW_LANG_C), F, "Test", false, "", 0);
DICompositeType *Class =
DIB.createClassType(CU, "MyClass", F, 0, 8, 8, 0, {}, nullptr, {}, 0,
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 773c32e..37826b2 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -6,11 +6,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
@@ -859,8 +860,8 @@ TEST_F(IRBuilderTest, createFunction) {
IRBuilder<> Builder(BB);
DIBuilder DIB(*M);
auto File = DIB.createFile("error.swift", "/");
- auto CU =
- DIB.createCompileUnit(dwarf::DW_LANG_Swift, File, "swiftc", true, "", 0);
+ auto CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_Swift),
+ File, "swiftc", true, "", 0);
auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
auto NoErr = DIB.createFunction(
CU, "noerr", "", File, 1, Type, 1, DINode::FlagZero,
@@ -893,9 +894,9 @@ TEST_F(IRBuilderTest, DIBuilder) {
IRBuilder<> Builder(BB);
DIBuilder DIB(*M);
auto File = DIB.createFile("F.CBL", "/");
- auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74,
- DIB.createFile("F.CBL", "/"),
- "llvm-cobol74", true, "", 0);
+ auto CU = DIB.createCompileUnit(
+ DISourceLanguageName(dwarf::DW_LANG_Cobol74),
+ DIB.createFile("F.CBL", "/"), "llvm-cobol74", true, "", 0);
auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
auto SP = DIB.createFunction(
CU, "foo", "", File, 1, Type, 1, DINode::FlagZero,
@@ -1004,7 +1005,8 @@ TEST_F(IRBuilderTest, createArtificialSubprogram) {
IRBuilder<> Builder(BB);
DIBuilder DIB(*M);
auto File = DIB.createFile("main.c", "/");
- auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "clang",
+ auto CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C), File,
+ "clang",
/*isOptimized=*/true, /*Flags=*/"",
/*Runtime Version=*/0);
auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
@@ -1083,7 +1085,8 @@ TEST_F(IRBuilderTest, appendDebugInfo) {
{
DIBuilder DIB(*M);
auto *File = DIB.createFile("main.c", "/");
- CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "clang",
+ CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C), File,
+ "clang",
/*isOptimized=*/true, /*Flags=*/"",
/*Runtime Version=*/0);
auto *ByteTy = DIB.createBasicType("byte0", 8, dwarf::DW_ATE_signed);
@@ -1158,9 +1161,9 @@ TEST_F(IRBuilderTest, DebugLoc) {
DIBuilder DIB(*M);
auto File = DIB.createFile("tmp.cpp", "/");
- auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C_plus_plus_11,
- DIB.createFile("tmp.cpp", "/"), "", true, "",
- 0);
+ auto CU =
+ DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C_plus_plus_11),
+ DIB.createFile("tmp.cpp", "/"), "", true, "", 0);
auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray({}));
auto SP =
DIB.createFunction(CU, "foo", "foo", File, 1, SPType, 1, DINode::FlagZero,
@@ -1191,9 +1194,8 @@ TEST_F(IRBuilderTest, DIImportedEntity) {
IRBuilder<> Builder(BB);
DIBuilder DIB(*M);
auto F = DIB.createFile("F.CBL", "/");
- auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74,
- F, "llvm-cobol74",
- true, "", 0);
+ auto CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_Cobol74),
+ F, "llvm-cobol74", true, "", 0);
MDTuple *Elements = MDTuple::getDistinct(Ctx, {});
DIB.createImportedDeclaration(CU, nullptr, F, 1);
@@ -1218,8 +1220,9 @@ TEST_F(IRBuilderTest, DIBuilderMacro) {
DIBuilder DIB(*M);
auto File1 = DIB.createFile("main.c", "/");
auto File2 = DIB.createFile("file.h", "/");
- auto CU = DIB.createCompileUnit(
- dwarf::DW_LANG_C, DIB.createFile("main.c", "/"), "llvm-c", true, "", 0);
+ auto CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C),
+ DIB.createFile("main.c", "/"), "llvm-c", true,
+ "", 0);
auto MDef0 =
DIB.createMacro(nullptr, 0, dwarf::DW_MACINFO_define, "M0", "V0");
auto TMF1 = DIB.createTempMacroFile(nullptr, 0, File1);
diff --git a/llvm/unittests/IR/MetadataTest.cpp b/llvm/unittests/IR/MetadataTest.cpp
index 7425703..85c79d1 100644
--- a/llvm/unittests/IR/MetadataTest.cpp
+++ b/llvm/unittests/IR/MetadataTest.cpp
@@ -101,8 +101,8 @@ protected:
}
DICompileUnit *getUnit() {
return DICompileUnit::getDistinct(
- Context, 1, getFile(), "clang", false, "-g", 2, "",
- DICompileUnit::FullDebug, getTuple(), getTuple(), getTuple(),
+ Context, DISourceLanguageName(1), getFile(), "clang", false, "-g", 2,
+ "", DICompileUnit::FullDebug, getTuple(), getTuple(), getTuple(),
getTuple(), getTuple(), 0, true, false,
DICompileUnit::DebugNameTableKind::Default, false, "/", "");
}
@@ -2896,13 +2896,14 @@ TEST_F(DICompileUnitTest, get) {
StringRef SysRoot = "/";
StringRef SDK = "MacOSX.sdk";
auto *N = DICompileUnit::getDistinct(
- Context, SourceLanguage, File, Producer, IsOptimized, Flags,
- RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
- RetainedTypes, GlobalVariables, ImportedEntities, Macros, DWOId, true,
- false, DICompileUnit::DebugNameTableKind::Default, false, SysRoot, SDK);
+ Context, DISourceLanguageName(SourceLanguage), File, Producer,
+ IsOptimized, Flags, RuntimeVersion, SplitDebugFilename, EmissionKind,
+ EnumTypes, RetainedTypes, GlobalVariables, ImportedEntities, Macros,
+ DWOId, true, false, DICompileUnit::DebugNameTableKind::Default, false,
+ SysRoot, SDK);
EXPECT_EQ(dwarf::DW_TAG_compile_unit, N->getTag());
- EXPECT_EQ(SourceLanguage, N->getSourceLanguage());
+ EXPECT_EQ(SourceLanguage, N->getSourceLanguage().getUnversionedName());
EXPECT_EQ(File, N->getFile());
EXPECT_EQ(Producer, N->getProducer());
EXPECT_EQ(IsOptimized, N->isOptimized());
@@ -2921,7 +2922,7 @@ TEST_F(DICompileUnitTest, get) {
TempDICompileUnit Temp = N->clone();
EXPECT_EQ(dwarf::DW_TAG_compile_unit, Temp->getTag());
- EXPECT_EQ(SourceLanguage, Temp->getSourceLanguage());
+ EXPECT_EQ(SourceLanguage, Temp->getSourceLanguage().getUnversionedName());
EXPECT_EQ(File, Temp->getFile());
EXPECT_EQ(Producer, Temp->getProducer());
EXPECT_EQ(IsOptimized, Temp->isOptimized());
@@ -2959,10 +2960,10 @@ TEST_F(DICompileUnitTest, replaceArrays) {
StringRef SysRoot = "/";
StringRef SDK = "MacOSX.sdk";
auto *N = DICompileUnit::getDistinct(
- Context, SourceLanguage, File, Producer, IsOptimized, Flags,
- RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
- RetainedTypes, nullptr, ImportedEntities, nullptr, DWOId, true, false,
- DICompileUnit::DebugNameTableKind::Default, false, SysRoot, SDK);
+ Context, DISourceLanguageName(SourceLanguage), File, Producer,
+ IsOptimized, Flags, RuntimeVersion, SplitDebugFilename, EmissionKind,
+ EnumTypes, RetainedTypes, nullptr, ImportedEntities, nullptr, DWOId, true,
+ false, DICompileUnit::DebugNameTableKind::Default, false, SysRoot, SDK);
auto *GlobalVariables = MDTuple::getDistinct(Context, {});
EXPECT_EQ(nullptr, N->getGlobalVariables().get());
diff --git a/llvm/unittests/IR/VerifierTest.cpp b/llvm/unittests/IR/VerifierTest.cpp
index 7a136e6..440db12 100644
--- a/llvm/unittests/IR/VerifierTest.cpp
+++ b/llvm/unittests/IR/VerifierTest.cpp
@@ -9,6 +9,7 @@
#include "llvm/IR/Verifier.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
@@ -232,8 +233,9 @@ TEST(VerifierTest, DetectInvalidDebugInfo) {
LLVMContext C;
Module M("M", C);
DIBuilder DIB(M);
- DIB.createCompileUnit(dwarf::DW_LANG_C89, DIB.createFile("broken.c", "/"),
- "unittest", false, "", 0);
+ DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C89),
+ DIB.createFile("broken.c", "/"), "unittest", false,
+ "", 0);
DIB.finalize();
EXPECT_FALSE(verifyModule(M));
@@ -247,7 +249,7 @@ TEST(VerifierTest, DetectInvalidDebugInfo) {
LLVMContext C;
Module M("M", C);
DIBuilder DIB(M);
- auto *CU = DIB.createCompileUnit(dwarf::DW_LANG_C89,
+ auto *CU = DIB.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C89),
DIB.createFile("broken.c", "/"),
"unittest", false, "", 0);
new GlobalVariable(M, Type::getInt8Ty(C), false,
diff --git a/llvm/unittests/Support/SpecialCaseListTest.cpp b/llvm/unittests/Support/SpecialCaseListTest.cpp
index 5be2b9e..750feda 100644
--- a/llvm/unittests/Support/SpecialCaseListTest.cpp
+++ b/llvm/unittests/Support/SpecialCaseListTest.cpp
@@ -22,33 +22,31 @@ namespace {
class SpecialCaseListTest : public ::testing::Test {
protected:
- std::unique_ptr<SpecialCaseList> makeSpecialCaseList(StringRef List,
- std::string &Error,
- bool UseGlobs = true) {
+ std::unique_ptr<SpecialCaseList>
+ makeSpecialCaseList(StringRef List, std::string &Error, int Version = 0) {
auto S = List.str();
- if (!UseGlobs)
- S = (Twine("#!special-case-list-v1\n") + S).str();
+ if (Version)
+ S = (Twine("#!special-case-list-v") + Twine(Version) + "\n" + S).str();
std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(S);
return SpecialCaseList::create(MB.get(), Error);
}
std::unique_ptr<SpecialCaseList> makeSpecialCaseList(StringRef List,
- bool UseGlobs = true) {
+ int Version = 0) {
std::string Error;
- auto SCL = makeSpecialCaseList(List, Error, UseGlobs);
+ auto SCL = makeSpecialCaseList(List, Error, Version);
assert(SCL);
assert(Error == "");
return SCL;
}
- std::string makeSpecialCaseListFile(StringRef Contents,
- bool UseGlobs = true) {
+ std::string makeSpecialCaseListFile(StringRef Contents, int Version = 0) {
int FD;
SmallString<64> Path;
sys::fs::createTemporaryFile("SpecialCaseListTest", "temp", FD, Path);
raw_fd_ostream OF(FD, true, true);
- if (!UseGlobs)
- OF << "#!special-case-list-v1\n";
+ if (Version)
+ OF << "#!special-case-list-v" << Version << "\n";
OF << Contents;
OF.close();
return std::string(Path.str());
@@ -261,7 +259,7 @@ TEST_F(SpecialCaseListTest, Version1) {
"fun:foo.*\n"
"fun:abc|def\n"
"fun:b.r\n",
- /*UseGlobs=*/false);
+ /*Version=*/1);
EXPECT_TRUE(SCL->inSection("sect1", "fun", "fooz"));
EXPECT_TRUE(SCL->inSection("sect2", "fun", "fooz"));
@@ -309,6 +307,46 @@ TEST_F(SpecialCaseListTest, Version2) {
EXPECT_FALSE(SCL->inSection("sect3", "fun", "bar"));
}
+TEST_F(SpecialCaseListTest, DotSlash) {
+ std::unique_ptr<SpecialCaseList> SCL2 = makeSpecialCaseList("[dot]\n"
+ "fun:./foo\n"
+ "src:./bar\n"
+ "[not]\n"
+ "fun:foo\n"
+ "src:bar\n");
+ std::unique_ptr<SpecialCaseList> SCL3 = makeSpecialCaseList("[dot]\n"
+ "fun:./foo\n"
+ "src:./bar\n"
+ "[not]\n"
+ "fun:foo\n"
+ "src:bar\n",
+ /*Version=*/3);
+
+ EXPECT_TRUE(SCL2->inSection("dot", "fun", "./foo"));
+ EXPECT_TRUE(SCL3->inSection("dot", "fun", "./foo"));
+
+ EXPECT_FALSE(SCL2->inSection("dot", "fun", "foo"));
+ EXPECT_FALSE(SCL3->inSection("dot", "fun", "foo"));
+
+ EXPECT_TRUE(SCL2->inSection("dot", "src", "./bar"));
+ EXPECT_FALSE(SCL3->inSection("dot", "src", "./bar"));
+
+ EXPECT_FALSE(SCL2->inSection("dot", "src", "bar"));
+ EXPECT_FALSE(SCL3->inSection("dot", "src", "bar"));
+
+ EXPECT_FALSE(SCL2->inSection("not", "fun", "./foo"));
+ EXPECT_FALSE(SCL3->inSection("not", "fun", "./foo"));
+
+ EXPECT_TRUE(SCL2->inSection("not", "fun", "foo"));
+ EXPECT_TRUE(SCL3->inSection("not", "fun", "foo"));
+
+ EXPECT_FALSE(SCL2->inSection("not", "src", "./bar"));
+ EXPECT_TRUE(SCL3->inSection("not", "src", "./bar"));
+
+ EXPECT_TRUE(SCL2->inSection("not", "src", "bar"));
+ EXPECT_TRUE(SCL3->inSection("not", "src", "bar"));
+}
+
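
The fixture change that enables this test generalizes the old boolean (globs vs. v1 regexes) into an integer version, so a test can pin any `#!special-case-list-vN` header. A condensed usage sketch mirroring the helper above (version 0 means no header):

```c++
#include "llvm/ADT/Twine.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SpecialCaseList.h"
#include <memory>
#include <string>

using namespace llvm;

// Build a SpecialCaseList pinned to a given format version (a sketch of the
// test fixture's helper, with error handling left to the caller).
static std::unique_ptr<SpecialCaseList> makeSCL(StringRef List, int Version,
                                                std::string &Error) {
  std::string S = List.str();
  if (Version)
    S = (Twine("#!special-case-list-v") + Twine(Version) + "\n" + S).str();
  std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(S);
  return SpecialCaseList::create(MB.get(), Error);
}
```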
TEST_F(SpecialCaseListTest, LinesInSection) {
std::unique_ptr<SpecialCaseList> SCL = makeSpecialCaseList("fun:foo\n"
"fun:bar\n"
diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp
index fe81986..d990808 100644
--- a/llvm/unittests/Transforms/Utils/CloningTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -482,10 +483,10 @@ protected:
DITypeRefArray ParamTypes = DBuilder.getOrCreateTypeArray({});
DISubroutineType *FuncType =
DBuilder.createSubroutineType(ParamTypes);
- auto *CU = DBuilder.createCompileUnit(dwarf::DW_LANG_C99,
- DBuilder.createFile("filename.c",
- "/file/dir"),
- "CloneFunc", false, "", 0);
+ auto *CU = DBuilder.createCompileUnit(
+ DISourceLanguageName(dwarf::DW_LANG_C99),
+ DBuilder.createFile("filename.c", "/file/dir"), "CloneFunc", false, "",
+ 0);
auto *Subprogram = DBuilder.createFunction(
CU, "f", "f", File, 4, FuncType, 3, DINode::FlagZero,
@@ -540,7 +541,7 @@ protected:
// Create another, empty, compile unit.
DIBuilder DBuilder2(*M);
- DBuilder2.createCompileUnit(dwarf::DW_LANG_C99,
+ DBuilder2.createCompileUnit(DISourceLanguageName(dwarf::DW_LANG_C99),
DBuilder.createFile("extra.c", "/file/dir"),
"CloneFunc", false, "", 0);
DBuilder2.finalize();
@@ -953,8 +954,9 @@ protected:
// confirm that compile units get cloned in the correct order.
DIBuilder EmptyBuilder(*OldM);
auto *File = EmptyBuilder.createFile("empty.c", "/file/dir/");
- (void)EmptyBuilder.createCompileUnit(dwarf::DW_LANG_C99, File,
- "EmptyUnit", false, "", 0);
+ (void)EmptyBuilder.createCompileUnit(
+ DISourceLanguageName(dwarf::DW_LANG_C99), File, "EmptyUnit", false,
+ "", 0);
EmptyBuilder.finalize();
}
@@ -973,10 +975,10 @@ protected:
auto *File = DBuilder.createFile("filename.c", "/file/dir/");
DITypeRefArray ParamTypes = DBuilder.getOrCreateTypeArray({});
DISubroutineType *DFuncType = DBuilder.createSubroutineType(ParamTypes);
- auto *CU = DBuilder.createCompileUnit(dwarf::DW_LANG_C99,
- DBuilder.createFile("filename.c",
- "/file/dir"),
- "CloneModule", false, "", 0);
+ auto *CU = DBuilder.createCompileUnit(
+ DISourceLanguageName(dwarf::DW_LANG_C99),
+ DBuilder.createFile("filename.c", "/file/dir"), "CloneModule", false,
+ "", 0);
// Function DI
auto *Subprogram = DBuilder.createFunction(
CU, "f", "f", File, 4, DFuncType, 3, DINode::FlagZero,
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index f52eb7b..5754db6 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -1393,7 +1393,7 @@ def EmitC_AssignOp : EmitC_Op<"assign", []> {
}
def EmitC_YieldOp : EmitC_Op<"yield",
- [Pure, Terminator, ParentOneOf<["ExpressionOp", "IfOp", "ForOp", "SwitchOp"]>]> {
+ [Pure, Terminator, ParentOneOf<["DoOp", "ExpressionOp", "ForOp", "IfOp", "SwitchOp"]>]> {
let summary = "Block termination operation";
let description = [{
The `emitc.yield` terminates its parent EmitC op's region, optionally yielding
@@ -1725,4 +1725,105 @@ def EmitC_GetFieldOp
let hasVerifier = 1;
}
+def EmitC_DoOp : EmitC_Op<"do",
+ [NoTerminator, OpAsmOpInterface, RecursiveMemoryEffects]> {
+ let summary = "Do-while operation";
+ let description = [{
+ The `emitc.do` operation represents a C/C++ do-while loop construct that
+ repeatedly executes a body region as long as a condition region evaluates to
+ true. The operation has two regions:
+
+ 1. A body region that contains the loop body
+ 2. A condition region that must yield a boolean value (i1)
+
+  The condition is evaluated after each iteration; its region is structured
+  as follows:
+ - The condition region must contain exactly one block with:
+ 1. An `emitc.expression` operation producing an i1 value
+ 2. An `emitc.yield` passing through the expression result
+ - The expression's body contains the actual condition logic
+
+ The body region is executed before the first evaluation of the
+ condition. Thus, there is a guarantee that the loop will be executed
+ at least once. The loop terminates when the condition yields false.
+
+ The canonical structure of `emitc.do` is:
+
+ ```mlir
+ emitc.do {
+ // Body region (no terminator required).
+ // Loop body operations...
+ } while {
+ // Condition region (must yield i1)
+ %condition = emitc.expression : () -> i1 {
+ // Condition computation...
+ %result = ... : i1 // Last operation must produce i1
+ emitc.yield %result : i1
+ }
+ // Forward expression result
+ emitc.yield %condition : i1
+ }
+ ```
+
+ Example:
+
+ ```mlir
+ emitc.func @do_example() {
+ %counter = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+ %end = emitc.literal "10" : i32
+ %step = emitc.literal "1" : i32
+
+ emitc.do {
+ // Print current value
+ %val = emitc.load %counter : !emitc.lvalue<i32>
+ emitc.verbatim "printf(\"%d\\n\", {});" args %val : i32
+
+ // Increment counter
+ %new_val = emitc.add %val, %step : (i32, i32) -> i32
+ "emitc.assign"(%counter, %new_val) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %condition = emitc.expression %counter, %end : (!emitc.lvalue<i32>, i32) -> i1 {
+ %current = emitc.load %counter : !emitc.lvalue<i32>
+ %cmp_res = emitc.cmp lt, %current, %end : (i32, i32) -> i1
+ emitc.yield %cmp_res : i1
+ }
+ emitc.yield %condition : i1
+ }
+ return
+ }
+ ```
+ ```c++
+ // Code emitted for the operation above.
+ void do_example() {
+ int32_t v1 = 0;
+ do {
+ int32_t v2 = v1;
+ printf("%d\n", v2);
+ int32_t v3 = v2 + 1;
+ v1 = v3;
+ } while (v1 < 10);
+ return;
+ }
+ ```
+ }];
+
+ let arguments = (ins);
+ let results = (outs);
+ let regions = (region SizedRegion<1>:$bodyRegion,
+ SizedRegion<1>:$conditionRegion);
+
+ let hasCustomAssemblyFormat = 1;
+ let hasVerifier = 1;
+
+ let extraClassDeclaration = [{
+ //===------------------------------------------------------------------===//
+ // OpAsmOpInterface Methods
+ //===------------------------------------------------------------------===//
+
+ /// EmitC ops in the body can omit their 'emitc.' prefix in the assembly.
+ static ::llvm::StringRef getDefaultDialect() {
+ return "emitc";
+ }
+ }];
+}
+
#endif // MLIR_DIALECT_EMITC_IR_EMITC
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td
index 3390f38..6504ca8 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td
@@ -57,7 +57,11 @@ class Linalg_RelayoutOp<string mnemonic, list<Trait> traits = []> :
/// tile factors.
DenseMap<int64_t, OpFoldResult> getDimAndTileMapping();
- /// Return the tile sizes as OpFoldResult.
+ // TODO: Return the folded result.
+    /// Return the tile sizes as OpFoldResult. This returns the Value of a
+    /// constant op, not the constant Attribute: e.g., for
+    /// %size = arith.constant 1 : i32, it returns %size, not 1.
SmallVector<OpFoldResult> getMixedTiles();
/// Return the tile sizes as `int64_t`. If a tile size is dynamic
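
The distinction matters to callers: an OpFoldResult from getMixedTiles() may wrap the SSA Value produced by an arith.constant rather than the constant Attribute itself. Callers wanting the attribute form can normalize, as the pack-decomposition change later in this diff does; a minimal sketch using the standard MLIR helper:

```c++
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Value.h"

using namespace mlir;

// Normalize an OpFoldResult: if it wraps a Value defined by a constant op,
// fold it back to the constant Attribute; otherwise return it unchanged.
static OpFoldResult normalizeTileSize(OpFoldResult ofr) {
  if (auto val = llvm::dyn_cast<Value>(ofr))
    return getAsOpFoldResult(val); // e.g. %size = arith.constant 1 -> attr 1
  return ofr;
}
```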
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
index 9123ac3..0d16255 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td
@@ -70,6 +70,86 @@ def OpenACC_PointerLikeTypeInterface : TypeInterface<"PointerLikeType"> {
return ::mlir::acc::VariableTypeCategory::uncategorized;
}]
>,
+ InterfaceMethod<
+ /*description=*/[{
+      Generates allocation operations for the pointer-like type, creating an
+      allocation that produces storage for an instance of the current type.
+
+ The `varName` parameter is optional and can be used to provide a name
+ for the allocated variable. If the current type is represented
+ in a way that it does not capture the pointee type, `varType` must be
+ passed in to provide the necessary type information.
+
+ The `originalVar` parameter is optional but enables support for dynamic
+ types (e.g., dynamic memrefs). When provided, implementations can extract
+ runtime dimension information from the original variable to create
+ allocations with matching dynamic sizes.
+
+      Returns a Value representing the result of the allocation, or a null
+      Value if the allocation could not be generated.
+ }],
+ /*retTy=*/"::mlir::Value",
+ /*methodName=*/"genAllocate",
+ /*args=*/(ins "::mlir::OpBuilder &":$builder,
+ "::mlir::Location":$loc,
+ "::llvm::StringRef":$varName,
+ "::mlir::Type":$varType,
+ "::mlir::Value":$originalVar),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{
+ return {};
+ }]
+ >,
+ InterfaceMethod<
+ /*description=*/[{
+ Generates deallocation operations for the pointer-like type. It deallocates
+ the instance provided.
+
+ The `varPtr` parameter is required and must represent an instance that was
+      previously allocated. If the current type is represented in a way that
+      does not capture the pointee type, `varType` must be passed in to
+      provide the necessary type information. Nothing is generated if the
+      allocation is `alloca`-like.
+      Returns true if the deallocation was generated, or was determined not
+      to be needed; false otherwise.
+ }],
+ /*retTy=*/"bool",
+ /*methodName=*/"genFree",
+ /*args=*/(ins "::mlir::OpBuilder &":$builder,
+ "::mlir::Location":$loc,
+ "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$varPtr,
+ "::mlir::Type":$varType),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{
+ return false;
+ }]
+ >,
+ InterfaceMethod<
+ /*description=*/[{
+ Generates copy operations for the pointer-like type. It copies the memory
+ from the source to the destination. Typically used to initialize one
+ variable of this type from another.
+
+ The `destination` and `source` parameters represent the target and source
+      instances, respectively. If the current type is represented in a way that
+ does not capture the pointee type, `varType` must be passed in to provide
+ the necessary type information.
+
+      Returns true if the copy was successfully generated, false otherwise.
+ }],
+ /*retTy=*/"bool",
+ /*methodName=*/"genCopy",
+ /*args=*/(ins "::mlir::OpBuilder &":$builder,
+ "::mlir::Location":$loc,
+ "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$destination,
+ "::mlir::TypedValue<::mlir::acc::PointerLikeType>":$source,
+ "::mlir::Type":$varType),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{
+ return false;
+ }]
+ >,
];
}
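
For a sense of how a concrete pointer-like type might satisfy these hooks, here is a hedged sketch for a memref-backed type, assuming static shapes and alloca-style storage (illustrative only; not an actual interface implementation from this patch):

```c++
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// Sketch: allocate stack storage for a statically shaped memref. A real
// implementation would use `originalVar` to recover dynamic sizes.
static Value genAllocateForMemref(OpBuilder &builder, Location loc,
                                  MemRefType type) {
  if (!type.hasStaticShape())
    return {}; // signal that no allocation was generated
  return memref::AllocaOp::create(builder, loc, type);
}

// Sketch: alloca-like storage needs no explicit free, so report success
// without emitting anything.
static bool genFreeForMemref(OpBuilder &builder, Location loc, Value var) {
  return llvm::isa_and_nonnull<memref::AllocaOp>(var.getDefiningOp());
}
```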
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 252c0b7..bb5d686 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -806,6 +806,7 @@ def Vector_ToElementsOp : Vector_Op<"to_elements", [
let results = (outs Variadic<AnyType>:$elements);
let assemblyFormat = "$source attr-dict `:` type($source)";
let hasFolder = 1;
+ let hasCanonicalizer = 1;
}
def Vector_FromElementsOp : Vector_Op<"from_elements", [
diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
index 1f239aa..519d9c8 100644
--- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
+++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
@@ -21,6 +21,7 @@
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/Passes.h"
+#include "llvm/Support/LogicalResult.h"
namespace mlir {
#define GEN_PASS_DEF_SCFTOEMITC
@@ -106,7 +107,7 @@ static void assignValues(ValueRange values, ValueRange variables,
emitc::AssignOp::create(rewriter, loc, var, value);
}
-SmallVector<Value> loadValues(const SmallVector<Value> &variables,
+SmallVector<Value> loadValues(ArrayRef<Value> variables,
PatternRewriter &rewriter, Location loc) {
return llvm::map_to_vector<>(variables, [&](Value var) {
Type type = cast<emitc::LValueType>(var.getType()).getValueType();
@@ -116,16 +117,15 @@ SmallVector<Value> loadValues(const SmallVector<Value> &variables,
static LogicalResult lowerYield(Operation *op, ValueRange resultVariables,
ConversionPatternRewriter &rewriter,
- scf::YieldOp yield) {
+ scf::YieldOp yield, bool createYield = true) {
Location loc = yield.getLoc();
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(yield);
SmallVector<Value> yieldOperands;
- if (failed(rewriter.getRemappedValues(yield.getOperands(), yieldOperands))) {
+ if (failed(rewriter.getRemappedValues(yield.getOperands(), yieldOperands)))
return rewriter.notifyMatchFailure(op, "failed to lower yield operands");
- }
assignValues(yieldOperands, resultVariables, rewriter, loc);
@@ -336,11 +336,177 @@ LogicalResult IndexSwitchOpLowering::matchAndRewrite(
return success();
}
+// Lower scf::while to emitc::do using mutable variables to maintain loop state
+// across iterations. The do-while structure ensures the condition is evaluated
+// after each iteration, matching SCF while semantics.
+struct WhileLowering : public OpConversionPattern<WhileOp> {
+ using OpConversionPattern::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(WhileOp whileOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Location loc = whileOp.getLoc();
+ MLIRContext *context = loc.getContext();
+
+ // Create an emitc::variable op for each result. These variables will be
+ // assigned to by emitc::assign ops within the loop body.
+ SmallVector<Value> resultVariables;
+ if (failed(createVariablesForResults(whileOp, getTypeConverter(), rewriter,
+ resultVariables)))
+ return rewriter.notifyMatchFailure(whileOp,
+ "Failed to create result variables");
+
+ // Create variable storage for loop-carried values to enable imperative
+ // updates while maintaining SSA semantics at conversion boundaries.
+ SmallVector<Value> loopVariables;
+ if (failed(createVariablesForLoopCarriedValues(
+ whileOp, rewriter, loopVariables, loc, context)))
+ return failure();
+
+ if (failed(lowerDoWhile(whileOp, loopVariables, resultVariables, context,
+ rewriter, loc)))
+ return failure();
+
+ rewriter.setInsertionPointAfter(whileOp);
+
+ // Load the final result values from result variables.
+ SmallVector<Value> finalResults =
+ loadValues(resultVariables, rewriter, loc);
+ rewriter.replaceOp(whileOp, finalResults);
+
+ return success();
+ }
+
+private:
+ // Initialize variables for loop-carried values to enable state updates
+ // across iterations without SSA argument passing.
+ LogicalResult createVariablesForLoopCarriedValues(
+ WhileOp whileOp, ConversionPatternRewriter &rewriter,
+ SmallVectorImpl<Value> &loopVars, Location loc,
+ MLIRContext *context) const {
+ OpBuilder::InsertionGuard guard(rewriter);
+ rewriter.setInsertionPoint(whileOp);
+
+ emitc::OpaqueAttr noInit = emitc::OpaqueAttr::get(context, "");
+
+ for (Value init : whileOp.getInits()) {
+ Type convertedType = getTypeConverter()->convertType(init.getType());
+ if (!convertedType)
+ return rewriter.notifyMatchFailure(whileOp, "type conversion failed");
+
+ emitc::VariableOp var = rewriter.create<emitc::VariableOp>(
+ loc, emitc::LValueType::get(convertedType), noInit);
+ rewriter.create<emitc::AssignOp>(loc, var.getResult(), init);
+ loopVars.push_back(var);
+ }
+
+ return success();
+ }
+
+ // Lower scf.while to emitc.do.
+ LogicalResult lowerDoWhile(WhileOp whileOp, ArrayRef<Value> loopVars,
+ ArrayRef<Value> resultVars, MLIRContext *context,
+ ConversionPatternRewriter &rewriter,
+ Location loc) const {
+ // Create a global boolean variable to store the loop condition state.
+ Type i1Type = IntegerType::get(context, 1);
+ auto globalCondition =
+ rewriter.create<emitc::VariableOp>(loc, emitc::LValueType::get(i1Type),
+ emitc::OpaqueAttr::get(context, ""));
+ Value conditionVal = globalCondition.getResult();
+
+ auto loweredDo = rewriter.create<emitc::DoOp>(loc);
+
+ // Convert region types to match the target dialect type system.
+ if (failed(rewriter.convertRegionTypes(&whileOp.getBefore(),
+ *getTypeConverter(), nullptr)) ||
+ failed(rewriter.convertRegionTypes(&whileOp.getAfter(),
+ *getTypeConverter(), nullptr))) {
+ return rewriter.notifyMatchFailure(whileOp,
+ "region types conversion failed");
+ }
+
+ // Prepare the before region (condition evaluation) for merging.
+ Block *beforeBlock = &whileOp.getBefore().front();
+ Block *bodyBlock = rewriter.createBlock(&loweredDo.getBodyRegion());
+ rewriter.setInsertionPointToStart(bodyBlock);
+
+ // Load current variable values to use as initial arguments for the
+ // condition block.
+ SmallVector<Value> replacingValues = loadValues(loopVars, rewriter, loc);
+ rewriter.mergeBlocks(beforeBlock, bodyBlock, replacingValues);
+
+ Operation *condTerminator =
+ loweredDo.getBodyRegion().back().getTerminator();
+ scf::ConditionOp condOp = cast<scf::ConditionOp>(condTerminator);
+ rewriter.setInsertionPoint(condOp);
+
+ // Update result variables with values from scf::condition.
+ SmallVector<Value> conditionArgs;
+ for (Value arg : condOp.getArgs()) {
+ conditionArgs.push_back(rewriter.getRemappedValue(arg));
+ }
+ assignValues(conditionArgs, resultVars, rewriter, loc);
+
+ // Convert scf.condition to condition variable assignment.
+ Value condition = rewriter.getRemappedValue(condOp.getCondition());
+ rewriter.create<emitc::AssignOp>(loc, conditionVal, condition);
+
+ // Wrap body region in conditional to preserve scf semantics. Only create
+ // ifOp if after-region is non-empty.
+ if (whileOp.getAfterBody()->getOperations().size() > 1) {
+ auto ifOp = rewriter.create<emitc::IfOp>(loc, condition, false, false);
+
+ // Prepare the after region (loop body) for merging.
+ Block *afterBlock = &whileOp.getAfter().front();
+ Block *ifBodyBlock = rewriter.createBlock(&ifOp.getBodyRegion());
+
+ // Replacement values for after block using condition op arguments.
+ SmallVector<Value> afterReplacingValues;
+ for (Value arg : condOp.getArgs())
+ afterReplacingValues.push_back(rewriter.getRemappedValue(arg));
+
+ rewriter.mergeBlocks(afterBlock, ifBodyBlock, afterReplacingValues);
+
+ if (failed(lowerYield(whileOp, loopVars, rewriter,
+ cast<scf::YieldOp>(ifBodyBlock->getTerminator()))))
+ return failure();
+ }
+
+ rewriter.eraseOp(condOp);
+
+ // Create condition region that loads from the flag variable.
+ Region &condRegion = loweredDo.getConditionRegion();
+ Block *condBlock = rewriter.createBlock(&condRegion);
+ rewriter.setInsertionPointToStart(condBlock);
+
+ auto exprOp = rewriter.create<emitc::ExpressionOp>(
+ loc, i1Type, conditionVal, /*do_not_inline=*/false);
+ Block *exprBlock = rewriter.createBlock(&exprOp.getBodyRegion());
+
+ // Set up the expression block to load the condition variable.
+ exprBlock->addArgument(conditionVal.getType(), loc);
+ rewriter.setInsertionPointToStart(exprBlock);
+
+ // Load the condition value and yield it as the expression result.
+ Value cond =
+ rewriter.create<emitc::LoadOp>(loc, i1Type, exprBlock->getArgument(0));
+ rewriter.create<emitc::YieldOp>(loc, cond);
+
+ // Yield the expression as the condition region result.
+ rewriter.setInsertionPointToEnd(condBlock);
+ rewriter.create<emitc::YieldOp>(loc, exprOp);
+
+ return success();
+ }
+};
+
void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns,
TypeConverter &typeConverter) {
patterns.add<ForLowering>(typeConverter, patterns.getContext());
patterns.add<IfLowering>(typeConverter, patterns.getContext());
patterns.add<IndexSwitchOpLowering>(typeConverter, patterns.getContext());
+ patterns.add<WhileLowering>(typeConverter, patterns.getContext());
}
void SCFToEmitCPass::runOnOperation() {
@@ -357,7 +523,8 @@ void SCFToEmitCPass::runOnOperation() {
// Configure conversion to lower out SCF operations.
ConversionTarget target(getContext());
- target.addIllegalOp<scf::ForOp, scf::IfOp, scf::IndexSwitchOp>();
+ target
+ .addIllegalOp<scf::ForOp, scf::IfOp, scf::IndexSwitchOp, scf::WhileOp>();
target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
if (failed(
applyPartialConversion(getOperation(), target, std::move(patterns))))
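
Condensed to its essence, the lowering aims at the following C++ shape: loop state lives in mutable variables, the before region runs at the top of the do body, the condition is latched into a flag, and the after region is guarded by an if on that flag. A hand-written illustration of the strategy (not actual emitter output):

```c++
#include <cstdint>

// Illustrative shape of an scf.while lowered to a do-while: state in
// mutable variables, before-region inside the body, condition latched.
int32_t lowered_while(int32_t init) {
  int32_t state = init; // loop-carried variable (emitc.variable + assign)
  int32_t result = 0;   // result variable
  bool cond = false;    // latched condition flag
  do {
    // before-region: publish results, then compute the condition
    result = state;
    cond = state < 10;
    if (cond) {
      // after-region: advance the loop-carried state
      state = state + 1;
    }
  } while (cond);
  return result;
}
```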
diff --git a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
index 57877b8..f449d90 100644
--- a/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
+++ b/mlir/lib/Conversion/XeVMToLLVM/XeVMToLLVM.cpp
@@ -214,6 +214,10 @@ static std::optional<LoadCacheControl> getCacheControl(BlockLoad2dOp op) {
return op.getCacheControl();
}
+static std::optional<LoadCacheControl> getCacheControl(BlockLoadOp op) {
+ return op.getCacheControl();
+}
+
static std::optional<LoadCacheControl> getCacheControl(BlockPrefetch2dOp op) {
return op.getCacheControl();
}
@@ -222,6 +226,10 @@ static std::optional<StoreCacheControl> getCacheControl(BlockStore2dOp op) {
return op.getCacheControl();
}
+static std::optional<StoreCacheControl> getCacheControl(BlockStoreOp op) {
+ return op.getCacheControl();
+}
+
static std::optional<LoadCacheControl> getCacheControl(LLVM::LoadOp op) {
if (op->hasAttr("cache_control")) {
auto attr = op->getAttrOfType<xevm::LoadCacheControlAttr>("cache_control");
@@ -263,6 +271,7 @@ getCacheControlMetadata(ConversionPatternRewriter &rewriter, OpType op) {
constexpr bool isLoad = std::is_same_v<OpType, BlockLoad2dOp> ||
std::is_same_v<OpType, BlockPrefetch2dOp> ||
std::is_same_v<OpType, LLVM::LoadOp> ||
+ std::is_same_v<OpType, BlockLoadOp> ||
std::is_same_v<OpType, PrefetchOp>;
const int32_t controlKey{isLoad ? loadCacheControlKey : storeCacheControlKey};
SmallVector<int32_t, decorationCacheControlArity> decorationsL1{
@@ -618,6 +627,77 @@ class LoadStorePrefetchToOCLPattern : public OpConversionPattern<OpType> {
return success();
}
};
+
+template <typename OpType>
+class BlockLoadStore1DToOCLPattern : public OpConversionPattern<OpType> {
+ using OpConversionPattern<OpType>::OpConversionPattern;
+ LogicalResult
+ matchAndRewrite(OpType op, typename OpType::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ constexpr bool isStore = std::is_same_v<OpType, xevm::BlockStoreOp>;
+ // Get OpenCL function name
+ // https://registry.khronos.org/OpenCL/extensions/
+ // intel/cl_intel_subgroup_local_block_io.html
+ std::string funcName{"intel_sub_group_block_"};
+ // Value or Result type can be vector or scalar
+ Type valOrResTy;
+ if constexpr (isStore) {
+ funcName += "write_u";
+ valOrResTy = op.getVal().getType();
+ } else {
+ funcName += "read_u";
+ valOrResTy = op.getType();
+ }
+ // Get element type of the vector/scalar
+ VectorType vecTy = dyn_cast<VectorType>(valOrResTy);
+ Type elemType = vecTy ? vecTy.getElementType() : valOrResTy;
+ funcName += getTypeMangling(elemType);
+ if (vecTy)
+ funcName += std::to_string(vecTy.getNumElements());
+ SmallVector<Type, 2> argTypes{};
+    // XeVM BlockLoadOp/BlockStoreOp always use signless integer types, but
+    // the OpenCL builtins expect unsigned types, so use unsigned types for
+    // the mangling.
+ SmallVector<bool, 2> isUnsigned{};
+ // arg0: pointer to the src/dst address
+ // arg1 - only if store : vector to store
+ // Prepare arguments
+ SmallVector<Value, 2> args{};
+ args.push_back(op.getPtr());
+ argTypes.push_back(op.getPtr().getType());
+ isUnsigned.push_back(true);
+ Type retType;
+ if constexpr (isStore) {
+ args.push_back(op.getVal());
+ argTypes.push_back(op.getVal().getType());
+ isUnsigned.push_back(true);
+ retType = LLVM::LLVMVoidType::get(rewriter.getContext());
+ } else {
+ retType = valOrResTy;
+ }
+ funcName = std::string("_Z") + std::to_string(funcName.size()) + funcName +
+ "PU3AS" +
+ std::to_string(op.getPtr().getType().getAddressSpace());
+ funcName += getTypeMangling(elemType, /*isUnsigned=*/true);
+ if constexpr (isStore)
+ funcName += getTypeMangling(valOrResTy, /*isUnsigned=*/true);
+ LLVMFuncAttributeOptions funcAttr{noUnwindWillReturnAttrs};
+
+ LLVM::CallOp call =
+ createDeviceFunctionCall(rewriter, funcName, retType, argTypes, args,
+ {}, funcAttr, op.getOperation());
+ if (std::optional<ArrayAttr> optCacheControls =
+ getCacheControlMetadata(rewriter, op)) {
+ call->setAttr(XeVMDialect::getCacheControlsAttrName(), *optCacheControls);
+ }
+ if constexpr (isStore)
+ rewriter.eraseOp(op);
+ else
+ rewriter.replaceOp(op, call->getResult(0));
+ return success();
+ }
+};
+
template <typename OpType>
class LLVMLoadStoreToOCLPattern : public OpConversionPattern<OpType> {
using OpConversionPattern<OpType>::OpConversionPattern;
@@ -693,7 +773,10 @@ void ::mlir::populateXeVMToLLVMConversionPatterns(ConversionTarget &target,
LoadStorePrefetchToOCLPattern<BlockPrefetch2dOp>,
MMAToOCLPattern, MemfenceToOCLPattern, PrefetchToOCLPattern,
LLVMLoadStoreToOCLPattern<LLVM::LoadOp>,
- LLVMLoadStoreToOCLPattern<LLVM::StoreOp>>(patterns.getContext());
+ LLVMLoadStoreToOCLPattern<LLVM::StoreOp>,
+ BlockLoadStore1DToOCLPattern<BlockLoadOp>,
+ BlockLoadStore1DToOCLPattern<BlockStoreOp>>(
+ patterns.getContext());
}
void ::mlir::registerConvertXeVMToLLVMInterface(DialectRegistry &registry) {
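
Tracing the name construction in BlockLoadStore1DToOCLPattern for a concrete case, a BlockLoadOp of vector<4 x i32> through a pointer in address space 1, yields intel_sub_group_block_read_ui4 wrapped in an Itanium-style prefix. The element manglings below ("i" signless, "j" unsigned) are assumptions about getTypeMangling, which this hunk does not show:

```c++
#include <cassert>
#include <string>

int main() {
  std::string funcName = "intel_sub_group_block_";
  funcName += "read_u"; // load case ("write_u" for stores)
  funcName += "i";      // assumed mangling of the i32 element type
  funcName += "4";      // vector length
  // The length is taken before the Itanium wrapper is prepended.
  funcName = "_Z" + std::to_string(funcName.size()) + funcName + "PU3AS1";
  funcName += "j"; // assumed unsigned mangling of the pointee
  assert(funcName == "_Z30intel_sub_group_block_read_ui4PU3AS1j");
  return 0;
}
```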
diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index 5c8564b..4754f0b 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -974,10 +974,10 @@ LogicalResult emitc::YieldOp::verify() {
Value result = getResult();
Operation *containingOp = getOperation()->getParentOp();
- if (result && containingOp->getNumResults() != 1)
+ if (!isa<DoOp>(containingOp) && result && containingOp->getNumResults() != 1)
return emitOpError() << "yields a value not returned by parent";
- if (!result && containingOp->getNumResults() != 0)
+ if (!isa<DoOp>(containingOp) && !result && containingOp->getNumResults() != 0)
return emitOpError() << "does not yield a value to be returned by parent";
return success();
@@ -1562,6 +1562,76 @@ LogicalResult GetFieldOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
}
//===----------------------------------------------------------------------===//
+// DoOp
+//===----------------------------------------------------------------------===//
+
+void DoOp::print(OpAsmPrinter &p) {
+ p << ' ';
+ p.printRegion(getBodyRegion(), /*printEntryBlockArgs=*/false);
+ p << " while ";
+ p.printRegion(getConditionRegion());
+ p.printOptionalAttrDictWithKeyword(getOperation()->getAttrs());
+}
+
+LogicalResult emitc::DoOp::verify() {
+ Block &condBlock = getConditionRegion().front();
+
+ if (condBlock.getOperations().size() != 2)
+ return emitOpError(
+ "condition region must contain exactly two operations: "
+ "'emitc.expression' followed by 'emitc.yield', but found ")
+ << condBlock.getOperations().size() << " operations";
+
+ Operation &first = condBlock.front();
+ auto exprOp = dyn_cast<emitc::ExpressionOp>(first);
+ if (!exprOp)
+ return emitOpError("expected first op in condition region to be "
+ "'emitc.expression', but got ")
+ << first.getName();
+
+ if (!exprOp.getResult().getType().isInteger(1))
+ return emitOpError("emitc.expression in condition region must return "
+ "'i1', but returns ")
+ << exprOp.getResult().getType();
+
+ Operation &last = condBlock.back();
+ auto condYield = dyn_cast<emitc::YieldOp>(last);
+ if (!condYield)
+ return emitOpError("expected last op in condition region to be "
+ "'emitc.yield', but got ")
+ << last.getName();
+
+ if (condYield.getNumOperands() != 1)
+ return emitOpError("expected condition region to return 1 value, but "
+ "it returns ")
+ << condYield.getNumOperands() << " values";
+
+ if (condYield.getOperand(0) != exprOp.getResult())
+ return emitError("'emitc.yield' must return result of "
+ "'emitc.expression' from this condition region");
+
+ Block &bodyBlock = getBodyRegion().front();
+ if (bodyBlock.mightHaveTerminator())
+ return emitOpError("body region must not contain terminator");
+
+ return success();
+}
+
+ParseResult DoOp::parse(OpAsmParser &parser, OperationState &result) {
+ Region *bodyRegion = result.addRegion();
+ Region *condRegion = result.addRegion();
+
+ if (parser.parseRegion(*bodyRegion) || parser.parseKeyword("while") ||
+ parser.parseRegion(*condRegion))
+ return failure();
+
+ if (bodyRegion->empty())
+ bodyRegion->emplaceBlock();
+
+ return parser.parseOptionalAttrDictWithKeyword(result.attributes);
+}
+
+//===----------------------------------------------------------------------===//
// TableGen'd op method definitions
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 7863c21..0dac688 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -1146,37 +1146,25 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
Attribute oneIdxAttr = rewriter.getIndexAttr(1);
Location loc = packOp.getLoc();
- Value input = getPackOpSourceOrPaddedSource(rewriter, packOp);
- DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
- packOp.getDimAndTileMapping();
int64_t srcRank = packOp.getSourceRank();
int64_t destRank = packOp.getDestRank();
- int64_t numTiles = destRank - srcRank;
+ ArrayRef<int64_t> innerDimsPos = packOp.getInnerDimsPos();
+ int64_t numberOfTiles = innerDimsPos.size();
- // 1. Extract the inner tile sizes.
- // Where possible, values are replaced with constant attributes (to match the
- // behaviour of `getPackOpSourceOrPaddedSource`).
- SmallVector<OpFoldResult> tileSizes;
- for (auto i : llvm::seq<unsigned>(0, srcRank)) {
- if (dimAndTileMapping.count(i)) {
- // Rather than taking the tile size as is, extact the actual constant
- // value Attribute where possible, e.g.:
- // [Value: %tile_size = arith.constant 8 : index] --> [Attribute: 8]
- auto [_, tileSize] =
- getSimplifiedOfrAndStaticSizePair(dimAndTileMapping[i], rewriter);
- tileSizes.push_back(tileSize);
- }
- }
+ // 1. Get the input that is going to be packed. If the input requires padding,
+ // add a padding operation and return that as the input.
+ Value input = getPackOpSourceOrPaddedSource(rewriter, packOp);
// 2. Transpose the input to match the inner tile order:
// %init = tensor.empty()
// %transposed_tile = linalg.transpose ins(%source_or_padded_source),
// outs(%init)
// Assumptions made:
- // 1. All outer dims are 1 - the corresponding transposition order doesn't
+ // - All outer dims are 1, so the corresponding transposition order doesn't
// matter, but requires all dim indices to be present.
+
+ // 2.1 Get the permutation for linalg.transpose
SmallVector<int64_t> srcPermForTranspose;
- ArrayRef<int64_t> innerDimPos(packOp.getInnerDimsPos());
for (int64_t i = 0; i < srcRank; i++) {
// We assume the `k` dimensions of the inner dim position, where `k` is the
// rank of the inner tiling, correspond to the last `k` indices of the
@@ -1185,27 +1173,34 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
// rank of the source tensor. For example if we have a source tensor with
// indices [0, 1, 2, 3] and inner dim position of [3, 0], the remaining
 // indices are [1, 2], and the transpose will be [1, 2, 3, 0].
- if (llvm::is_contained(innerDimPos, i))
+ if (llvm::is_contained(innerDimsPos, i))
continue;
srcPermForTranspose.push_back(i);
}
- srcPermForTranspose.append(innerDimPos.begin(), innerDimPos.end());
+ srcPermForTranspose.append(innerDimsPos.begin(), innerDimsPos.end());
+
+ // 2.2 Create the init tensor for linalg.transpose with the correct shape
+ SmallVector<OpFoldResult> shapeForEmptyOp(srcRank - numberOfTiles,
+ oneIdxAttr);
+ shapeForEmptyOp.append(packOp.getMixedTiles());
+
+ // getMixedTiles() may return Values that point to constant ops rather than
+ // the constant attributes themselves; fold those back to Attributes here.
+ llvm::transform(shapeForEmptyOp, shapeForEmptyOp.begin(),
+ [&](OpFoldResult ofr) {
+ if (auto val = llvm::dyn_cast<Value>(ofr))
+ return getAsOpFoldResult(val);
+ return ofr;
+ });
LDBG() << "Pack permutation: " << packOp;
LDBG() << "perm: " << llvm::interleaved(srcPermForTranspose);
+ LDBG() << "Shape of empty tensor: " << llvm::interleaved(shapeForEmptyOp);
- // 2.1 Create tensor.empty (init value for TransposeOp)
- SmallVector<OpFoldResult> transShapeForEmptyOp(srcRank - numTiles,
- oneIdxAttr);
- transShapeForEmptyOp.append(tileSizes);
-
- applyPermutationToVector<OpFoldResult>(transShapeForEmptyOp,
- srcPermForTranspose);
- Value empty =
- tensor::EmptyOp::create(rewriter, loc, transShapeForEmptyOp,
- packOp.getSourceType().getElementType());
+ Value empty = tensor::EmptyOp::create(
+ rewriter, loc, shapeForEmptyOp, packOp.getSourceType().getElementType());
- // 2.2 Create linalg.transpose
+ // 2.3 Create linalg.transpose
auto transposedOp = linalg::TransposeOp::create(rewriter, loc, input, empty,
srcPermForTranspose);
@@ -1214,8 +1209,7 @@ LogicalResult DecomposeOuterUnitDimsPackOpPattern::matchAndRewrite(
SmallVector<OpFoldResult> writeStrides(destRank, oneIdxAttr);
SmallVector<OpFoldResult> writeOffsets(destRank, zeroIdxAttr);
// Outer dims are all 1s!
- SmallVector<OpFoldResult> writeSizes(destRank - dimAndTileMapping.size(),
- oneIdxAttr);
+ SmallVector<OpFoldResult> writeSizes(destRank - numberOfTiles, oneIdxAttr);
SmallVector<int64_t> writeShape;
for (auto tileSize : packOp.getMixedTiles()) {
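
For orientation, this pattern rewrites a linalg.pack whose outer dims are all unit into a (possibly padded) source, a linalg.transpose into inner-tile order, and a tensor.insert_slice into the destination. A hedged sketch of the overall rewrite, with illustrative shapes and names:

%packed = linalg.pack %src inner_dims_pos = [0, 1] inner_tiles = [8, 2]
    into %dest : tensor<8x2xf32> -> tensor<1x1x8x2xf32>

// becomes, roughly:
%init = tensor.empty() : tensor<8x2xf32>
%tr = linalg.transpose ins(%src : tensor<8x2xf32>)
    outs(%init : tensor<8x2xf32>) permutation = [0, 1]
%res = tensor.insert_slice %tr into %dest[0, 0, 0, 0] [1, 1, 8, 2] [1, 1, 1, 1]
    : tensor<8x2xf32> into tensor<1x1x8x2xf32>
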
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 697cb35..237aab4 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -27,7 +27,7 @@ using namespace mlir::nvgpu;
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"
-void nvgpu::NVGPUDialect::initialize() {
+void NVGPUDialect::initialize() {
addTypes<
#define GET_TYPEDEF_LIST
#include "mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc"
@@ -42,7 +42,7 @@ void nvgpu::NVGPUDialect::initialize() {
>();
}
-bool nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
+bool NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
if (!memorySpace)
return false;
if (auto intAttr = llvm::dyn_cast<IntegerAttr>(memorySpace))
@@ -52,7 +52,7 @@ bool nvgpu::NVGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
return false;
}
-bool nvgpu::NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
+bool NVGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
Attribute memorySpace = type.getMemorySpace();
return isSharedMemoryAddressSpace(memorySpace);
}
@@ -140,7 +140,6 @@ static LogicalResult verifyMmaSyncOp(Operation *op,
TypedValue<VectorType> matrixC,
const std::array<int64_t, 3> &mmaShape,
bool tf32Enabled, bool sparse = false) {
-
// The verification for mma.sync covering various shapes and data types is
// based on the fundamental tensor core shape.
@@ -292,7 +291,6 @@ LogicalResult MmaSparseSyncOp::verify() {
// NVGPU_LdMatrixOp
//===----------------------------------------------------------------------===//
LogicalResult LdMatrixOp::verify() {
-
// ldmatrix reads data from source in shared memory
auto srcMemref = llvm::cast<MemRefType>(getSrcMemref().getType());
@@ -345,7 +343,7 @@ LogicalResult LdMatrixOp::verify() {
// NVGPU_TmaAsyncLoadOp
//===----------------------------------------------------------------------===//
-unsigned getSwizzleBytes(TensorMapSwizzleKind kind) {
+static unsigned getSwizzleBytes(TensorMapSwizzleKind kind) {
switch (kind) {
case TensorMapSwizzleKind::SWIZZLE_32B:
return 32;
@@ -359,7 +357,7 @@ unsigned getSwizzleBytes(TensorMapSwizzleKind kind) {
}
std::optional<InFlightDiagnostic> verifyTmaDescriptorWithMemref(
- Operation *op, nvgpu::TensorMapDescriptorType descType,
+ Operation *op, TensorMapDescriptorType descType,
std::optional<MemRefType> memrefType = std::nullopt) {
MemRefType descMemref = descType.getTensor();
// Limitation
@@ -655,8 +653,7 @@ LogicalResult WarpgroupMmaStoreOp::verify() {
//===----------------------------------------------------------------------===//
LogicalResult WarpgroupMmaInitAccumulatorOp::verify() {
-
- nvgpu::WarpgroupAccumulatorType accType = getMatrixC().getType();
+ WarpgroupAccumulatorType accType = getMatrixC().getType();
int64_t sizeM = accType.getFragmented().getDimSize(0);
int64_t sizeN = accType.getFragmented().getDimSize(1);
Type elemType = accType.getFragmented().getElementType();
diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
index 46e82bd..2a857ed 100644
--- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp
@@ -43,7 +43,7 @@ using namespace mlir::transform;
// Apply...ConversionPatternsOp
//===----------------------------------------------------------------------===//
-void transform::ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns(
+void ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns(
TypeConverter &typeConverter, RewritePatternSet &patterns) {
auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
/// device-side async tokens cannot be materialized in nvvm. We just
@@ -62,62 +62,58 @@ void transform::ApplyNVGPUToNVVMConversionPatternsOp::populatePatterns(
llvm_unreachable("unknown address space enum value");
return static_cast<unsigned>(NVVM::NVVMMemorySpace::Generic);
});
- llvmTypeConverter.addConversion(
- [&](nvgpu::DeviceAsyncTokenType type) -> Type {
- return llvmTypeConverter.convertType(
- IntegerType::get(type.getContext(), 32));
- });
- llvmTypeConverter.addConversion([&](nvgpu::MBarrierTokenType type) -> Type {
+ llvmTypeConverter.addConversion([&](DeviceAsyncTokenType type) -> Type {
+ return llvmTypeConverter.convertType(
+ IntegerType::get(type.getContext(), 32));
+ });
+ llvmTypeConverter.addConversion([&](MBarrierTokenType type) -> Type {
return llvmTypeConverter.convertType(
IntegerType::get(type.getContext(), 64));
});
- llvmTypeConverter.addConversion(
- [&](nvgpu::WarpgroupAccumulatorType type) -> Type {
- Type elemType = type.getFragmented().getElementType();
- int64_t sizeM = type.getFragmented().getDimSize(0);
- int64_t sizeN = type.getFragmented().getDimSize(1);
-
- unsigned numMembers;
- if (elemType.isF32() || elemType.isInteger(32))
- numMembers = sizeN / 2;
- else if (elemType.isF16())
- numMembers = sizeN / 4;
- else
- llvm_unreachable("unsupported type for warpgroup accumulator");
-
- SmallVector<Type> innerStructBody;
- for (unsigned i = 0; i < numMembers; i++)
- innerStructBody.push_back(elemType);
- auto innerStructType = LLVM::LLVMStructType::getLiteral(
- type.getContext(), innerStructBody);
-
- SmallVector<Type> structBody;
- for (int i = 0; i < sizeM; i += kWgmmaSizeM)
- structBody.push_back(innerStructType);
-
- auto convertedType =
- LLVM::LLVMStructType::getLiteral(type.getContext(), structBody);
- return llvmTypeConverter.convertType(convertedType);
- });
- llvmTypeConverter.addConversion([&](nvgpu::MBarrierGroupType type) -> Type {
+ llvmTypeConverter.addConversion([&](WarpgroupAccumulatorType type) -> Type {
+ Type elemType = type.getFragmented().getElementType();
+ int64_t sizeM = type.getFragmented().getDimSize(0);
+ int64_t sizeN = type.getFragmented().getDimSize(1);
+
+ unsigned numMembers;
+ if (elemType.isF32() || elemType.isInteger(32))
+ numMembers = sizeN / 2;
+ else if (elemType.isF16())
+ numMembers = sizeN / 4;
+ else
+ llvm_unreachable("unsupported type for warpgroup accumulator");
+
+ SmallVector<Type> innerStructBody;
+ for (unsigned i = 0; i < numMembers; i++)
+ innerStructBody.push_back(elemType);
+ auto innerStructType =
+ LLVM::LLVMStructType::getLiteral(type.getContext(), innerStructBody);
+
+ SmallVector<Type> structBody;
+ for (int i = 0; i < sizeM; i += kWgmmaSizeM)
+ structBody.push_back(innerStructType);
+
+ auto convertedType =
+ LLVM::LLVMStructType::getLiteral(type.getContext(), structBody);
+ return llvmTypeConverter.convertType(convertedType);
+ });
+ llvmTypeConverter.addConversion([&](MBarrierGroupType type) -> Type {
return llvmTypeConverter.convertType(
getMBarrierMemrefType(type.getContext(), type));
});
llvmTypeConverter.addConversion(
- [&](nvgpu::WarpgroupMatrixDescriptorType type) -> Type {
+ [&](WarpgroupMatrixDescriptorType type) -> Type {
return llvmTypeConverter.convertType(
IntegerType::get(type.getContext(), 64));
});
- llvmTypeConverter.addConversion(
- [&](nvgpu::TensorMapDescriptorType type) -> Type {
- return LLVM::LLVMPointerType::get(type.getContext());
- });
+ llvmTypeConverter.addConversion([&](TensorMapDescriptorType type) -> Type {
+ return LLVM::LLVMPointerType::get(type.getContext());
+ });
populateNVGPUToNVVMConversionPatterns(llvmTypeConverter, patterns);
}
-LogicalResult
-transform::ApplyNVGPUToNVVMConversionPatternsOp::verifyTypeConverter(
- transform::TypeConverterBuilderOpInterface builder) {
+LogicalResult ApplyNVGPUToNVVMConversionPatternsOp::verifyTypeConverter(
+ TypeConverterBuilderOpInterface builder) {
if (builder.getTypeConverterType() != "LLVMTypeConverter")
return emitOpError("expected LLVMTypeConverter");
return success();
@@ -127,17 +123,18 @@ transform::ApplyNVGPUToNVVMConversionPatternsOp::verifyTypeConverter(
// CreateAsyncGroupsOp
//===---------------------------------------------------------------------===//
-void transform::CreateAsyncGroupsOp::getEffects(
+void CreateAsyncGroupsOp::getEffects(
SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
- transform::consumesHandle(getTargetMutable(), effects);
- transform::producesHandle(getOperation()->getOpResults(), effects);
- transform::modifiesPayload(effects);
+ consumesHandle(getTargetMutable(), effects);
+ producesHandle(getOperation()->getOpResults(), effects);
+ modifiesPayload(effects);
}
-DiagnosedSilenceableFailure transform::CreateAsyncGroupsOp::applyToOne(
- TransformRewriter &rewriter, Operation *target,
- ApplyToEachResultList &results, TransformState &state) {
- nvgpu::createAsyncGroups(rewriter, target, getBypassL1());
+DiagnosedSilenceableFailure
+CreateAsyncGroupsOp::applyToOne(TransformRewriter &rewriter, Operation *target,
+ ApplyToEachResultList &results,
+ TransformState &state) {
+ createAsyncGroups(rewriter, target, getBypassL1());
results.push_back(target);
return DiagnosedSilenceableFailure::success();
}
@@ -218,7 +215,7 @@ collectStage0PipeliningOps(scf::ForOp forOp,
continue;
}
- if (isa<nvgpu::DeviceAsyncCopyOp, nvgpu::DeviceAsyncCreateGroupOp>(op)) {
+ if (isa<DeviceAsyncCopyOp, DeviceAsyncCreateGroupOp>(op)) {
ops.insert(&op);
ops.insert(std::make_move_iterator(barriers.begin()),
std::make_move_iterator(barriers.end()));
@@ -246,7 +243,7 @@ setAsyncWaitGroupsInFlight(OpBuilder &builder, Operation *op,
unsigned iteration, unsigned depth) {
// Based on the order of copies within the loop we need to set the number
// of copies in flight, unless it is already set.
- auto waitOp = dyn_cast<nvgpu::DeviceAsyncWaitOp>(op);
+ auto waitOp = dyn_cast<DeviceAsyncWaitOp>(op);
if (!waitOp || waitOp.getNumGroups())
return;
@@ -312,13 +309,12 @@ static Operation *replaceOpWithPredicatedOp(RewriterBase &rewriter,
// original number of iterations, in particular side-effect free operations
// and barriers, even if they cannot be predicated.
if (isMemoryEffectFree(op) ||
- isa<gpu::BarrierOp, nvgpu::DeviceAsyncCreateGroupOp,
- nvgpu::DeviceAsyncWaitOp>(op)) {
+ isa<gpu::BarrierOp, DeviceAsyncCreateGroupOp, DeviceAsyncWaitOp>(op)) {
return op;
}
// Otherwise, only async copies can currently be predicated.
- auto asyncCopyOp = dyn_cast<nvgpu::DeviceAsyncCopyOp>(op);
+ auto asyncCopyOp = dyn_cast<DeviceAsyncCopyOp>(op);
if (!asyncCopyOp)
return nullptr;
@@ -335,8 +331,8 @@ static Operation *replaceOpWithPredicatedOp(RewriterBase &rewriter,
Value c0Index = arith::ConstantIndexOp::create(rewriter, loc, 0);
auto srcElements = arith::SelectOp::create(rewriter, loc, predicate,
originalSrcElement, c0Index);
- auto asyncCopyZeroFillOp = nvgpu::DeviceAsyncCopyOp::create(
- rewriter, loc, nvgpu::DeviceAsyncTokenType::get(asyncCopyOp.getContext()),
+ auto asyncCopyZeroFillOp = DeviceAsyncCopyOp::create(
+ rewriter, loc, DeviceAsyncTokenType::get(asyncCopyOp.getContext()),
asyncCopyOp.getDst(), asyncCopyOp.getDstIndices(), asyncCopyOp.getSrc(),
asyncCopyOp.getSrcIndices(), asyncCopyOp.getDstElements(), srcElements,
UnitAttr());
@@ -805,17 +801,16 @@ FailureOr<Operation *> MmaSyncBuilder::buildMmaSync(LinalgOp linalgOp) {
rhsIndexFn, rhsShape);
Value res = buildMmaSyncMemRefLoadOperand(b, loc, laneId, resMemRef,
resIndexFn, resShape);
- res = nvgpu::MmaSyncOp::create(b, loc, lhs, rhs, res, info.mmaShape,
- info.tf32Enabled);
+ res =
+ MmaSyncOp::create(b, loc, lhs, rhs, res, info.mmaShape, info.tf32Enabled);
buildMmaSyncMemRefStoreOperand(b, loc, res, laneId, resMemRef, resIndexFn,
resShape);
return res.getDefiningOp();
}
-DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne(
- transform::TransformRewriter &rewriter, LinalgOp linalgOp,
- transform::ApplyToEachResultList &results,
- transform::TransformState &state) {
+DiagnosedSilenceableFailure RewriteMatmulAsMmaSyncOp::applyToOne(
+ TransformRewriter &rewriter, LinalgOp linalgOp,
+ ApplyToEachResultList &results, TransformState &state) {
bool fail = true;
// TODO: more robust detection of matmulOp, with transposes etc.
if (isa_and_nonnull<linalg::MatmulOp>(linalgOp.getOperation())) {
@@ -854,43 +849,42 @@ struct HopperBuilder {
HopperBuilder(RewriterBase &rewriter, Location loc)
: rewriter(rewriter), loc(loc) {}
- TypedValue<nvgpu::MBarrierGroupType>
+ TypedValue<MBarrierGroupType>
buildAndInitBarrierInSharedMemory(OpFoldResult numThreads);
/// Create tma descriptor op to initiate transfer from global to shared
/// memory. This must be done before the launch op, on the host.
- TypedValue<nvgpu::TensorMapDescriptorType>
+ TypedValue<TensorMapDescriptorType>
buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
gpu::LaunchOp launchOp);
/// Build a tma load from global memory to shared memory using `barrier` to
/// synchronize. Return the number of bytes that will be transferred.
- OpFoldResult
- buildTmaAsyncLoad(TypedValue<nvgpu::TensorMapDescriptorType> globalDesc,
- TypedValue<MemRefType> sharedMemref,
- TypedValue<nvgpu::MBarrierGroupType> barrier,
- SmallVectorImpl<Operation *> &loadOps);
- void buildBarrierArriveTx(TypedValue<nvgpu::MBarrierGroupType> barrier,
+ OpFoldResult buildTmaAsyncLoad(TypedValue<TensorMapDescriptorType> globalDesc,
+ TypedValue<MemRefType> sharedMemref,
+ TypedValue<MBarrierGroupType> barrier,
+ SmallVectorImpl<Operation *> &loadOps);
+ void buildBarrierArriveTx(TypedValue<MBarrierGroupType> barrier,
ArrayRef<OpFoldResult> sizes);
/// If threadIdx.x == 0 does TMA request + wait, else just wait.
/// Return the operation that performs the transfer on thread0.
// TODO: In the future, don't hardcode to thread 0 but elect a leader.
SmallVector<Operation *> buildPredicateLoadsOnThread0(
- ArrayRef<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescriptors,
+ ArrayRef<TypedValue<TensorMapDescriptorType>> globalDescriptors,
ArrayRef<TypedValue<MemRefType>> sharedMemBuffers,
- TypedValue<nvgpu::MBarrierGroupType> barrier);
+ TypedValue<MBarrierGroupType> barrier);
- void buildTryWaitParity(TypedValue<nvgpu::MBarrierGroupType> barrier);
+ void buildTryWaitParity(TypedValue<MBarrierGroupType> barrier);
RewriterBase &rewriter;
Location loc;
};
SmallVector<Operation *> HopperBuilder::buildPredicateLoadsOnThread0(
- ArrayRef<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescriptors,
+ ArrayRef<TypedValue<TensorMapDescriptorType>> globalDescriptors,
ArrayRef<TypedValue<MemRefType>> sharedMemBuffers,
- TypedValue<nvgpu::MBarrierGroupType> barrier) {
+ TypedValue<MBarrierGroupType> barrier) {
SmallVector<Operation *> loadOps;
Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
Value tidx = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x);
@@ -931,22 +925,22 @@ static Attribute getSharedAddressSpaceAttribute(OpBuilder &b) {
// return b.getI64IntegerAttr(static_cast<int64_t>(kSharedMemorySpace));
}
-TypedValue<nvgpu::MBarrierGroupType>
+TypedValue<MBarrierGroupType>
HopperBuilder::buildAndInitBarrierInSharedMemory(OpFoldResult numThreads) {
auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter);
- Value barrier = nvgpu::MBarrierCreateOp::create(
+ Value barrier = MBarrierCreateOp::create(
rewriter, loc,
- nvgpu::MBarrierGroupType::get(rewriter.getContext(), sharedMemorySpace));
+ MBarrierGroupType::get(rewriter.getContext(), sharedMemorySpace));
Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
nvgpu::MBarrierInitOp::create(
rewriter, loc, barrier,
getValueOrCreateConstantIndexOp(rewriter, loc, numThreads), zero,
Value());
gpu::BarrierOp::create(rewriter, loc);
- return cast<TypedValue<nvgpu::MBarrierGroupType>>(barrier);
+ return cast<TypedValue<MBarrierGroupType>>(barrier);
}
-TypedValue<nvgpu::TensorMapDescriptorType>
+TypedValue<TensorMapDescriptorType>
HopperBuilder::buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
gpu::LaunchOp launchOp) {
OpBuilder::InsertionGuard guard(rewriter);
@@ -962,29 +956,29 @@ HopperBuilder::buildGlobalMemRefDescriptor(TypedValue<MemRefType> memref,
getValueOrCreateConstantIndexOp(rewriter, loc, mixedSizes);
auto sharedMemorySpace = getSharedAddressSpaceAttribute(rewriter);
- Value desc = nvgpu::TmaCreateDescriptorOp::create(
+ Value desc = TmaCreateDescriptorOp::create(
rewriter, loc,
- nvgpu::TensorMapDescriptorType::get(
- rewriter.getContext(),
- MemRefType::Builder(memref.getType())
- .setMemorySpace(sharedMemorySpace),
- TensorMapSwizzleKind::SWIZZLE_NONE,
- TensorMapL2PromoKind::L2PROMO_NONE, TensorMapOOBKind::OOB_ZERO,
- TensorMapInterleaveKind::INTERLEAVE_NONE),
+ TensorMapDescriptorType::get(rewriter.getContext(),
+ MemRefType::Builder(memref.getType())
+ .setMemorySpace(sharedMemorySpace),
+ TensorMapSwizzleKind::SWIZZLE_NONE,
+ TensorMapL2PromoKind::L2PROMO_NONE,
+ TensorMapOOBKind::OOB_ZERO,
+ TensorMapInterleaveKind::INTERLEAVE_NONE),
unrankedMemRef, sizes);
- return cast<TypedValue<nvgpu::TensorMapDescriptorType>>(desc);
+ return cast<TypedValue<TensorMapDescriptorType>>(desc);
}
-OpFoldResult HopperBuilder::buildTmaAsyncLoad(
- TypedValue<nvgpu::TensorMapDescriptorType> globalDesc,
- TypedValue<MemRefType> sharedMemref,
- TypedValue<nvgpu::MBarrierGroupType> barrier,
- SmallVectorImpl<Operation *> &loadOps) {
+OpFoldResult
+HopperBuilder::buildTmaAsyncLoad(TypedValue<TensorMapDescriptorType> globalDesc,
+ TypedValue<MemRefType> sharedMemref,
+ TypedValue<MBarrierGroupType> barrier,
+ SmallVectorImpl<Operation *> &loadOps) {
MLIRContext *ctx = rewriter.getContext();
Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
- Operation *loadOp = nvgpu::TmaAsyncLoadOp::create(
- rewriter, loc, sharedMemref, barrier, globalDesc, ValueRange{zero, zero},
- zero, Value(), Value());
+ Operation *loadOp =
+ TmaAsyncLoadOp::create(rewriter, loc, sharedMemref, barrier, globalDesc,
+ ValueRange{zero, zero}, zero, Value(), Value());
loadOps.push_back(loadOp);
auto mixedSizes = memref::getMixedSizes(rewriter, loc, sharedMemref);
SmallVector<AffineExpr> symbols(mixedSizes.size());
@@ -997,9 +991,8 @@ OpFoldResult HopperBuilder::buildTmaAsyncLoad(
return res;
}
-void HopperBuilder::buildBarrierArriveTx(
- TypedValue<nvgpu::MBarrierGroupType> barrier,
- ArrayRef<OpFoldResult> mixedSizes) {
+void HopperBuilder::buildBarrierArriveTx(TypedValue<MBarrierGroupType> barrier,
+ ArrayRef<OpFoldResult> mixedSizes) {
 assert(!mixedSizes.empty() && "expected non-empty sizes");
MLIRContext *ctx = rewriter.getContext();
SmallVector<AffineExpr> symbols(mixedSizes.size());
@@ -1013,8 +1006,7 @@ void HopperBuilder::buildBarrierArriveTx(
Value());
}
-void HopperBuilder::buildTryWaitParity(
- TypedValue<nvgpu::MBarrierGroupType> barrier) {
+void HopperBuilder::buildTryWaitParity(TypedValue<MBarrierGroupType> barrier) {
Type i1 = rewriter.getI1Type();
Value parity = LLVM::ConstantOp::create(rewriter, loc, i1, 0);
// 10M is an arbitrary, not too small or too big number to specify the number
@@ -1058,11 +1050,11 @@ SmallVector<Operation *> CopyBuilder::rewrite(ArrayRef<Operation *> copyOps) {
ArrayRef<OpFoldResult>{launchOp.getBlockSizeX(), launchOp.getBlockSizeY(),
launchOp.getBlockSizeZ()});
- TypedValue<nvgpu::MBarrierGroupType> barrier =
+ TypedValue<MBarrierGroupType> barrier =
buildAndInitBarrierInSharedMemory(numThreads);
SmallVector<TypedValue<MemRefType>> shmems;
- SmallVector<TypedValue<nvgpu::TensorMapDescriptorType>> globalDescs;
+ SmallVector<TypedValue<TensorMapDescriptorType>> globalDescs;
for (Operation *op : copyOps) {
auto copyOp = cast<linalg::CopyOp>(op);
auto inMemRef =
@@ -1071,7 +1063,7 @@ SmallVector<Operation *> CopyBuilder::rewrite(ArrayRef<Operation *> copyOps) {
"expected in to be a 2D memref");
// 2. Build global memory descriptor.
- TypedValue<nvgpu::TensorMapDescriptorType> globalDesc =
+ TypedValue<TensorMapDescriptorType> globalDesc =
buildGlobalMemRefDescriptor(inMemRef, launchOp);
globalDescs.push_back(globalDesc);
@@ -1098,9 +1090,8 @@ SmallVector<Operation *> CopyBuilder::rewrite(ArrayRef<Operation *> copyOps) {
}
DiagnosedSilenceableFailure
-transform::RewriteCopyAsTmaOp::apply(transform::TransformRewriter &rewriter,
- transform::TransformResults &results,
- transform::TransformState &state) {
+RewriteCopyAsTmaOp::apply(TransformRewriter &rewriter,
+ TransformResults &results, TransformState &state) {
auto payloadOps = state.getPayloadOps(getTarget());
gpu::LaunchOp commonLaunchOp;
Operation *firstOp, *failingOp;
@@ -1137,15 +1128,14 @@ transform::RewriteCopyAsTmaOp::apply(transform::TransformRewriter &rewriter,
namespace {
class NVGPUTransformDialectExtension
- : public transform::TransformDialectExtension<
- NVGPUTransformDialectExtension> {
+ : public TransformDialectExtension<NVGPUTransformDialectExtension> {
public:
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(NVGPUTransformDialectExtension)
NVGPUTransformDialectExtension() {
declareGeneratedDialect<arith::ArithDialect>();
declareGeneratedDialect<affine::AffineDialect>();
- declareGeneratedDialect<nvgpu::NVGPUDialect>();
+ declareGeneratedDialect<NVGPUDialect>();
declareGeneratedDialect<NVVM::NVVMDialect>();
declareGeneratedDialect<vector::VectorDialect>();
registerTransformOps<
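
The predication path above keeps side-effect-free ops and barriers unchanged and rewrites device async copies so that a false predicate transfers zero source elements, zero-filling the destination instead. A rough sketch of that rewrite (the indices, the element count 4, and %pred are illustrative):

// Original copy inside the pipelined loop:
%t0 = nvgpu.device_async_copy %src[%i], %dst[%j], 4
    : memref<128xf32> to memref<64xf32, 3>

// Predicated form: copy 4 elements when %pred holds, otherwise 0.
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%n = arith.select %pred, %c4, %c0 : index
%t1 = nvgpu.device_async_copy %src[%i], %dst[%j], 4, %n
    : memref<128xf32> to memref<64xf32, 3>
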
diff --git a/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp b/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp
index 5b89c87..7f626a6 100644
--- a/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp
+++ b/mlir/lib/Dialect/NVGPU/Transforms/MmaSyncTF32Transform.cpp
@@ -64,6 +64,5 @@ private:
void mlir::nvgpu::populateMmaSyncF32ToTF32Patterns(
RewritePatternSet &patterns, nvgpu::MmaSyncF32Lowering precision) {
-
patterns.add<MmaSyncF32ToTF32Pattern>(patterns.getContext(), precision);
}
diff --git a/mlir/lib/Dialect/NVGPU/Utils/MMAUtils.cpp b/mlir/lib/Dialect/NVGPU/Utils/MMAUtils.cpp
index 809d634..9e5ea93 100644
--- a/mlir/lib/Dialect/NVGPU/Utils/MMAUtils.cpp
+++ b/mlir/lib/Dialect/NVGPU/Utils/MMAUtils.cpp
@@ -168,8 +168,7 @@ nvgpu::getLaneIdAndValueIdToOperandCoord(OpBuilder &builder, Location loc,
const WarpMatrixInfo &fragmentType) {
Type elementType = fragmentType.vectorType.getElementType();
ArrayRef<int64_t> operandShape = fragmentType.vectorType.getShape();
- FailureOr<nvgpu::FragmentElementInfo> regInfo =
- getMmaSyncRegisterType(fragmentType);
+ FailureOr<FragmentElementInfo> regInfo = getMmaSyncRegisterType(fragmentType);
if (failed(regInfo))
return failure();
@@ -199,8 +198,8 @@ nvgpu::getLaneIdAndValueIdToOperandCoord(OpBuilder &builder, Location loc,
(logicalValueIdDim % elementsPerRegister)});
}
-FailureOr<nvgpu::LdMatrixParams>
-nvgpu::getLdMatrixParams(const WarpMatrixInfo &type, bool transpose) {
+FailureOr<LdMatrixParams> nvgpu::getLdMatrixParams(const WarpMatrixInfo &type,
+ bool transpose) {
LdMatrixParams params;
Type elType = type.vectorType.getElementType();
params.fragmentType = type.vectorType;
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 6598ac1..6564a4e 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -7,6 +7,7 @@
// =============================================================================
#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -44,6 +45,7 @@ struct MemRefPointerLikeModel
Type getElementType(Type pointer) const {
return cast<MemRefType>(pointer).getElementType();
}
+
mlir::acc::VariableTypeCategory
getPointeeTypeCategory(Type pointer, TypedValue<PointerLikeType> varPtr,
Type varType) const {
@@ -70,6 +72,115 @@ struct MemRefPointerLikeModel
assert(memrefTy.getRank() > 0 && "rank expected to be positive");
return mlir::acc::VariableTypeCategory::array;
}
+
+ mlir::Value genAllocate(Type pointer, OpBuilder &builder, Location loc,
+ StringRef varName, Type varType,
+ Value originalVar) const {
+ auto memrefTy = cast<MemRefType>(pointer);
+
+ // Check if this is a static memref (all dimensions are known) - if yes
+ // then we can generate an alloca operation.
+ if (memrefTy.hasStaticShape())
+ return memref::AllocaOp::create(builder, loc, memrefTy).getResult();
+
+ // For dynamic memrefs, extract sizes from the original variable if
+ // provided. Otherwise they cannot be handled.
+ if (originalVar && originalVar.getType() == memrefTy &&
+ memrefTy.hasRank()) {
+ SmallVector<Value> dynamicSizes;
+ for (int64_t i = 0; i < memrefTy.getRank(); ++i) {
+ if (memrefTy.isDynamicDim(i)) {
+ // Extract the size of dimension i from the original variable
+ auto indexValue = arith::ConstantIndexOp::create(builder, loc, i);
+ auto dimSize =
+ memref::DimOp::create(builder, loc, originalVar, indexValue);
+ dynamicSizes.push_back(dimSize);
+ }
+ // Note: we only add dynamic sizes to the dynamicSizes array; static
+ // dimensions are handled automatically by AllocOp.
+ }
+ return memref::AllocOp::create(builder, loc, memrefTy, dynamicSizes)
+ .getResult();
+ }
+
+ // TODO: Unranked not yet supported.
+ return {};
+ }
+
+ bool genFree(Type pointer, OpBuilder &builder, Location loc,
+ TypedValue<PointerLikeType> varPtr, Type varType) const {
+ if (auto memrefValue = dyn_cast<TypedValue<MemRefType>>(varPtr)) {
+ // Walk through casts to find the original allocation
+ Value currentValue = memrefValue;
+ Operation *originalAlloc = nullptr;
+
+ // Follow the chain of operations to find the original allocation
+ // even if a casted result is provided.
+ while (currentValue) {
+ if (auto *definingOp = currentValue.getDefiningOp()) {
+ // Check if this is an allocation operation
+ if (isa<memref::AllocOp, memref::AllocaOp>(definingOp)) {
+ originalAlloc = definingOp;
+ break;
+ }
+
+ // Check if this is a cast operation we can look through
+ if (auto castOp = dyn_cast<memref::CastOp>(definingOp)) {
+ currentValue = castOp.getSource();
+ continue;
+ }
+
+ // Check for other cast-like operations
+ if (auto reinterpretCastOp =
+ dyn_cast<memref::ReinterpretCastOp>(definingOp)) {
+ currentValue = reinterpretCastOp.getSource();
+ continue;
+ }
+
+ // If we can't look through this operation, stop
+ break;
+ }
+ // This is a block argument or similar - can't trace further.
+ break;
+ }
+
+ if (originalAlloc) {
+ if (isa<memref::AllocaOp>(originalAlloc)) {
+ // This is an alloca - no dealloc needed, but return true (success)
+ return true;
+ }
+ if (isa<memref::AllocOp>(originalAlloc)) {
+ // This is an alloc - generate dealloc
+ memref::DeallocOp::create(builder, loc, memrefValue);
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ bool genCopy(Type pointer, OpBuilder &builder, Location loc,
+ TypedValue<PointerLikeType> destination,
+ TypedValue<PointerLikeType> source, Type varType) const {
+ // Generate a copy operation between two memrefs
+ auto destMemref = dyn_cast_if_present<TypedValue<MemRefType>>(destination);
+ auto srcMemref = dyn_cast_if_present<TypedValue<MemRefType>>(source);
+
+ // Per the memref documentation, source and destination must have the same
+ // element type and shape to be compatible. Check this before generating the
+ // copy so we do not fail later with an IR verification error.
+ if (destMemref && srcMemref &&
+ destMemref.getType().getElementType() ==
+ srcMemref.getType().getElementType() &&
+ destMemref.getType().getShape() == srcMemref.getType().getShape()) {
+ memref::CopyOp::create(builder, loc, srcMemref, destMemref);
+ return true;
+ }
+
+ return false;
+ }
};
struct LLVMPointerPointerLikeModel
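
Concretely, these recipes would materialize IR along the following lines for memrefs (a hedged sketch; %orig, %src, and %dst are illustrative values):

// genAllocate, static shape: a stack allocation suffices.
%a = memref.alloca() : memref<4x8xf32>

// genAllocate, dynamic shape: recover each dynamic extent from the original.
%c0 = arith.constant 0 : index
%d0 = memref.dim %orig, %c0 : memref<?x8xf32>
%b = memref.alloc(%d0) : memref<?x8xf32>

// genFree walks memref.cast / memref.reinterpret_cast chains back to the
// allocation; allocas need no dealloc, allocs get one:
memref.dealloc %b : memref<?x8xf32>

// genCopy, when element types and shapes match:
memref.copy %src, %dst : memref<4x8xf32> to memref<4x8xf32>
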
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index b0132e8..14e235f 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -47,6 +47,7 @@
#include <cassert>
#include <cstdint>
+#include <numeric>
#include "mlir/Dialect/Vector/IR/VectorDialect.cpp.inc"
// Pull in all enum type and utility function definitions.
@@ -2412,9 +2413,38 @@ foldToElementsFromElements(ToElementsOp toElementsOp,
return success();
}
+/// Folds vector.to_elements(vector.broadcast(%x)) for the scalar case only.
+///
+/// Example:
+/// %b = vector.broadcast %x : f32 to vector<3xf32>
+/// %e:3 = vector.to_elements %b : vector<3xf32>
+/// user_op %e#0, %e#1, %e#2
+/// becomes:
+/// user_op %x, %x, %x
+///
+/// The vector source case is handled by a canonicalization pattern.
+static LogicalResult
+foldToElementsOfBroadcast(ToElementsOp toElementsOp,
+ SmallVectorImpl<OpFoldResult> &results) {
+ auto bcastOp = toElementsOp.getSource().getDefiningOp<BroadcastOp>();
+ if (!bcastOp)
+ return failure();
+ // Vectors are handled in the ToElementsOfBroadcast RewritePattern.
+ if (isa<VectorType>(bcastOp.getSource().getType()))
+ return failure();
+
+ auto resultVecType = cast<VectorType>(toElementsOp.getSource().getType());
+
+ Value scalar = bcastOp.getSource();
+ results.assign(resultVecType.getNumElements(), scalar);
+ return success();
+}
+
LogicalResult ToElementsOp::fold(FoldAdaptor adaptor,
SmallVectorImpl<OpFoldResult> &results) {
- return foldToElementsFromElements(*this, results);
+ if (succeeded(foldToElementsFromElements(*this, results)))
+ return success();
+ return foldToElementsOfBroadcast(*this, results);
}
LogicalResult
@@ -2427,6 +2457,94 @@ ToElementsOp::inferReturnTypes(MLIRContext *ctx, std::optional<Location> loc,
return success();
}
+/// Canonicalize `vector.to_elements(vector.broadcast(%v))` where `%v` is a
+/// vector.
+/// - Build `vector.to_elements %v` and remap each destination element to the
+/// corresponding source element using broadcast rules (match or 1 →
+/// replicate).
+///
+/// Example:
+/// %v = vector.broadcast %src : vector<2xf32> to vector<3x2xf32>
+/// %e:6 = vector.to_elements %v : vector<3x2xf32>
+/// becomes:
+/// %src_elems:2 = vector.to_elements %src : vector<2xf32>
+/// // uses: %src_elems#0, %src_elems#1, %src_elems#0,
+/// // %src_elems#1, %src_elems#0, %src_elems#1
+struct ToElementsOfBroadcast final : OpRewritePattern<ToElementsOp> {
+ using Base::Base;
+
+ LogicalResult matchAndRewrite(ToElementsOp toElementsOp,
+ PatternRewriter &rewriter) const override {
+ auto bcastOp = toElementsOp.getSource().getDefiningOp<BroadcastOp>();
+ if (!bcastOp)
+ return failure();
+
+ // Only handle broadcasts from a vector source here.
+ auto srcType = dyn_cast<VectorType>(bcastOp.getSource().getType());
+ if (!srcType)
+ return failure();
+
+ auto dstType = cast<VectorType>(toElementsOp.getSource().getType());
+
+ ArrayRef<int64_t> dstShape = dstType.getShape();
+ ArrayRef<int64_t> srcShape = srcType.getShape();
+
+ int64_t dstRank = dstShape.size();
+ int64_t srcRank = srcShape.size();
+
+ // Create elements for the broadcast source vector.
+ auto srcElems = vector::ToElementsOp::create(
+ rewriter, toElementsOp.getLoc(), bcastOp.getSource());
+
+ int64_t dstCount = std::accumulate(dstShape.begin(), dstShape.end(),
+ int64_t{1}, std::multiplies<int64_t>());
+
+ SmallVector<Value> replacements;
+ replacements.reserve(dstCount);
+
+ // For each element of the destination, determine which element of the
+ // source should be used. We walk all destination positions using a single
+ // counter, decode it into per-dimension indices, then build the matching
+ // source position: use the same index where sizes match, and use 0 where
+ // the source size is 1 (replication). This mapping is needed so we can
+ // replace each result of to_elements with the corresponding element from
+ // the broadcast source.
+ // Inner-dimension stretch example:
+ // %v = vector.broadcast %src : vector<2x1x2xf32> to vector<2x3x2xf32>
+ // %e:12 = vector.to_elements %v : vector<2x3x2xf32>
+ // becomes:
+ // %src_elems:4 = vector.to_elements %src : vector<2x1x2xf32>
+ // // uses: %src_elems#0, %src_elems#1, %src_elems#0,
+ // // %src_elems#1, %src_elems#0, %src_elems#1,
+ // // %src_elems#2, %src_elems#3, %src_elems#2,
+ // // %src_elems#3, %src_elems#2, %src_elems#3
+
+ // Row-major strides for the destination shape.
+ SmallVector<int64_t> dstStrides = computeStrides(dstShape);
+ // Row-major strides for the source shape.
+ SmallVector<int64_t> srcStrides = computeStrides(srcShape);
+ SmallVector<int64_t> dstIdx(dstRank);
+ SmallVector<int64_t> srcIdx(srcRank);
+ for (int64_t lin = 0; lin < dstCount; ++lin) {
+ // Convert linear destination index to per-dimension indices.
+ dstIdx = delinearize(lin, dstStrides);
+ for (int64_t k = 0; k < srcRank; ++k)
+ srcIdx[k] = (srcShape[k] == 1) ? 0 : dstIdx[dstRank - srcRank + k];
+ // Convert per-dimension source indices back to a linear index.
+ int64_t srcLin = linearize(srcIdx, srcStrides);
+ replacements.push_back(srcElems.getResult(srcLin));
+ }
+
+ rewriter.replaceOp(toElementsOp, replacements);
+ return success();
+ }
+};
+
+void ToElementsOp::getCanonicalizationPatterns(RewritePatternSet &results,
+ MLIRContext *context) {
+ results.add<ToElementsOfBroadcast>(context);
+}
+
//===----------------------------------------------------------------------===//
// FromElementsOp
//===----------------------------------------------------------------------===//
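
To make the destination-to-source mapping concrete, take the inner-dimension stretch example from the comment: dst shape 2x3x2 has row-major strides [6, 2, 1], and src shape 2x1x2 has strides [2, 2, 1]. For destination element 5:

dstIdx = delinearize(5, [6, 2, 1]) = [0, 2, 1]
srcIdx = [0, 0, 1]   // the size-1 source dim maps to index 0
srcLin = linearize([0, 0, 1], [2, 2, 1]) = 1

which matches %src_elems#1, the sixth entry in the use list above.
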
diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index a5bd80e..5fe5f41 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -201,6 +201,8 @@ struct CppEmitter {
/// Return the existing or a new label of a Block.
StringRef getOrCreateName(Block &block);
+ LogicalResult emitInlinedExpression(Value value);
+
/// Whether to map an mlir integer to a unsigned integer in C++.
bool shouldMapToUnsigned(IntegerType::SignednessSemantics val);
@@ -557,6 +559,30 @@ static LogicalResult printOperation(CppEmitter &emitter,
return success();
}
+static LogicalResult printOperation(CppEmitter &emitter, emitc::DoOp doOp) {
+ raw_indented_ostream &os = emitter.ostream();
+
+ os << "do {\n";
+ os.indent();
+
+ Block &bodyBlock = doOp.getBodyRegion().front();
+ for (Operation &op : bodyBlock) {
+ if (failed(emitter.emitOperation(op, /*trailingSemicolon=*/true)))
+ return failure();
+ }
+
+ os.unindent() << "} while (";
+
+ Block &condBlock = doOp.getConditionRegion().front();
+ auto condYield = cast<emitc::YieldOp>(condBlock.back());
+ if (failed(emitter.emitExpression(
+ cast<emitc::ExpressionOp>(condYield.getOperand(0).getDefiningOp()))))
+ return failure();
+
+ os << ");";
+ return success();
+}
+
static LogicalResult printOperation(CppEmitter &emitter, emitc::CmpOp cmpOp) {
Operation *operation = cmpOp.getOperation();
@@ -1711,13 +1737,14 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
emitc::BitwiseRightShiftOp, emitc::BitwiseXorOp, emitc::CallOp,
emitc::CallOpaqueOp, emitc::CastOp, emitc::ClassOp,
emitc::CmpOp, emitc::ConditionalOp, emitc::ConstantOp,
- emitc::DeclareFuncOp, emitc::DivOp, emitc::ExpressionOp,
- emitc::FieldOp, emitc::FileOp, emitc::ForOp, emitc::FuncOp,
- emitc::GlobalOp, emitc::IfOp, emitc::IncludeOp, emitc::LoadOp,
- emitc::LogicalAndOp, emitc::LogicalNotOp, emitc::LogicalOrOp,
- emitc::MulOp, emitc::RemOp, emitc::ReturnOp, emitc::SubOp,
- emitc::SwitchOp, emitc::UnaryMinusOp, emitc::UnaryPlusOp,
- emitc::VariableOp, emitc::VerbatimOp>(
+ emitc::DeclareFuncOp, emitc::DivOp, emitc::DoOp,
+ emitc::ExpressionOp, emitc::FieldOp, emitc::FileOp,
+ emitc::ForOp, emitc::FuncOp, emitc::GlobalOp, emitc::IfOp,
+ emitc::IncludeOp, emitc::LoadOp, emitc::LogicalAndOp,
+ emitc::LogicalNotOp, emitc::LogicalOrOp, emitc::MulOp,
+ emitc::RemOp, emitc::ReturnOp, emitc::SubOp, emitc::SwitchOp,
+ emitc::UnaryMinusOp, emitc::UnaryPlusOp, emitc::VariableOp,
+ emitc::VerbatimOp>(
[&](auto op) { return printOperation(*this, op); })
// Func ops.
@@ -1765,9 +1792,9 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
 // Never emit a semicolon for some operations, especially if ending with
// `}`.
trailingSemicolon &=
- !isa<cf::CondBranchOp, emitc::DeclareFuncOp, emitc::FileOp, emitc::ForOp,
- emitc::IfOp, emitc::IncludeOp, emitc::SwitchOp, emitc::VerbatimOp>(
- op);
+ !isa<cf::CondBranchOp, emitc::DeclareFuncOp, emitc::DoOp, emitc::FileOp,
+ emitc::ForOp, emitc::IfOp, emitc::IncludeOp, emitc::SwitchOp,
+ emitc::VerbatimOp>(op);
os << (trailingSemicolon ? ";\n" : "\n");
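
Given the DoOp verifier added earlier in this patch, the casts in printOperation are safe: the final yield of the condition region is always fed by an emitc.expression. The emitted C therefore always takes this shape (a sketch, with placeholders for the actual statements):

do {
  <body statements>;
} while (<inlined condition expression>);
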
diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp
index 8b03265..4bbcd8e 100644
--- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp
+++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp
@@ -59,7 +59,8 @@ DICompileUnitAttr DebugImporter::translateImpl(llvm::DICompileUnit *node) {
std::underlying_type_t<llvm::DICompileUnit::DebugNameTableKind>>(
node->getNameTableKind()));
return DICompileUnitAttr::get(
- context, getOrCreateDistinctID(node), node->getSourceLanguage(),
+ context, getOrCreateDistinctID(node),
+ node->getSourceLanguage().getUnversionedName(),
translate(node->getFile()), getStringAttrOrNull(node->getRawProducer()),
node->isOptimized(), emissionKind.value(), nameTableKind.value(),
getStringAttrOrNull(node->getRawSplitDebugFilename()));
diff --git a/mlir/test/Conversion/SCFToEmitC/while.mlir b/mlir/test/Conversion/SCFToEmitC/while.mlir
new file mode 100644
index 0000000..28524a0
--- /dev/null
+++ b/mlir/test/Conversion/SCFToEmitC/while.mlir
@@ -0,0 +1,293 @@
+// RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-emitc %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -convert-to-emitc="filter-dialects=scf" %s | FileCheck %s
+
+emitc.func @payload_one_result(%arg: i32) -> i32 {
+ %result = add %arg, %arg : (i32, i32) -> i32
+ return %result : i32
+}
+
+func.func @one_result() -> i32 {
+ %init = emitc.literal "1.0" : i32
+ %var = emitc.literal "1.0" : i32
+ %exit = emitc.literal "10.0" : i32
+
+ %res = scf.while (%arg1 = %init) : (i32) -> i32 {
+ %sum = emitc.add %arg1, %var : (i32, i32) -> i32
+ %condition = emitc.cmp lt, %sum, %exit : (i32, i32) -> i1
+ %next = emitc.add %arg1, %arg1 : (i32, i32) -> i32
+ scf.condition(%condition) %next : i32
+ } do {
+ ^bb0(%arg2: i32):
+ %next_arg1 = emitc.call @payload_one_result(%arg2) : (i32) -> i32
+ scf.yield %next_arg1 : i32
+ }
+
+ return %res : i32
+}
+// CHECK-LABEL: emitc.func @payload_one_result(
+// CHECK-SAME: %[[ARG0:.*]]: i32) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: func.func @one_result() -> i32 {
+// CHECK: %[[VAL_0:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_1:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_2:.*]] = emitc.literal "10.0" : i32
+// CHECK: %[[VAL_3:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_6:.*]] = load %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_7:.*]] = add %[[VAL_6]], %[[VAL_1]] : (i32, i32) -> i32
+// CHECK: %[[VAL_8:.*]] = cmp lt, %[[VAL_7]], %[[VAL_2]] : (i32, i32) -> i1
+// CHECK: %[[VAL_9:.*]] = add %[[VAL_6]], %[[VAL_6]] : (i32, i32) -> i32
+// CHECK: assign %[[VAL_9]] : i32 to %[[VAL_3]] : <i32>
+// CHECK: assign %[[VAL_8]] : i1 to %[[VAL_5]] : <i1>
+// CHECK: if %[[VAL_8]] {
+// CHECK: %[[VAL_10:.*]] = call @payload_one_result(%[[VAL_9]]) : (i32) -> i32
+// CHECK: assign %[[VAL_10]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: }
+// CHECK: } while {
+// CHECK: %[[VAL_11:.*]] = expression %[[VAL_5]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_12:.*]] = load %[[VAL_5]] : <i1>
+// CHECK: yield %[[VAL_12]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_11]] : i1
+// CHECK: }
+// CHECK: %[[VAL_13:.*]] = emitc.load %[[VAL_3]] : <i32>
+// CHECK: return %[[VAL_13]] : i32
+// CHECK: }
+
+emitc.func @payload_two_results(%arg: i32) -> i32 {
+ %result = add %arg, %arg : (i32, i32) -> i32
+ return %result : i32
+}
+
+func.func @two_results() -> i32 {
+ %init = emitc.literal "1.0" : i32
+ %exit = emitc.literal "10.0" : i32
+
+ %res1, %res2 = scf.while (%arg1_1 = %init, %arg1_2 = %init) : (i32, i32) -> (i32, i32) {
+ %sum = emitc.add %arg1_1, %arg1_2 : (i32, i32) -> i32
+ %condition = emitc.cmp lt, %sum, %exit : (i32, i32) -> i1
+ scf.condition(%condition) %init, %arg1_2 : i32, i32
+ } do {
+ ^bb0(%arg2_1 : i32, %arg2_2 : i32):
+ %next1 = emitc.call @payload_two_results(%arg2_1) : (i32) -> i32
+ %next2 = emitc.call @payload_two_results(%arg2_2) : (i32) -> i32
+ scf.yield %next1, %next2 : i32, i32
+ }
+
+ return %res1 : i32
+}
+// CHECK-LABEL: emitc.func @payload_two_results(
+// CHECK-SAME: %[[ARG0:.*]]: i32) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: func.func @two_results() -> i32 {
+// CHECK: %[[VAL_0:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_1:.*]] = emitc.literal "10.0" : i32
+// CHECK: %[[VAL_2:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_3:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_5]] : <i32>
+// CHECK: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_7:.*]] = load %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_8:.*]] = load %[[VAL_5]] : <i32>
+// CHECK: %[[VAL_9:.*]] = add %[[VAL_7]], %[[VAL_8]] : (i32, i32) -> i32
+// CHECK: %[[VAL_10:.*]] = cmp lt, %[[VAL_9]], %[[VAL_1]] : (i32, i32) -> i1
+// CHECK: assign %[[VAL_0]] : i32 to %[[VAL_2]] : <i32>
+// CHECK: assign %[[VAL_8]] : i32 to %[[VAL_3]] : <i32>
+// CHECK: assign %[[VAL_10]] : i1 to %[[VAL_6]] : <i1>
+// CHECK: if %[[VAL_10]] {
+// CHECK: %[[VAL_11:.*]] = call @payload_two_results(%[[VAL_0]]) : (i32) -> i32
+// CHECK: %[[VAL_12:.*]] = call @payload_two_results(%[[VAL_8]]) : (i32) -> i32
+// CHECK: assign %[[VAL_11]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: assign %[[VAL_12]] : i32 to %[[VAL_5]] : <i32>
+// CHECK: }
+// CHECK: } while {
+// CHECK: %[[VAL_13:.*]] = expression %[[VAL_6]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_14:.*]] = load %[[VAL_6]] : <i1>
+// CHECK: yield %[[VAL_14]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_13]] : i1
+// CHECK: }
+// CHECK: %[[VAL_15:.*]] = emitc.load %[[VAL_2]] : <i32>
+// CHECK: %[[VAL_16:.*]] = emitc.load %[[VAL_3]] : <i32>
+// CHECK: return %[[VAL_15]] : i32
+// CHECK: }
+
+emitc.func @payload_double_use(%arg: i32) -> i32 {
+ %result = add %arg, %arg : (i32, i32) -> i32
+ return %result : i32
+}
+
+emitc.func @foo_with_side_effect(%arg: i32, %p : !emitc.ptr<i32>) -> i32 {
+ %sum = add %arg, %arg : (i32, i32) -> i32
+ emitc.verbatim "{}[0] = {};" args %p, %sum : !emitc.ptr<i32>, i32
+ return %sum : i32
+}
+
+func.func @double_use(%p : !emitc.ptr<i32>) -> i32 {
+ %init = emitc.literal "1.0" : i32
+ %var = emitc.literal "1.0" : i32
+ %exit = emitc.literal "10.0" : i32
+ %res = scf.while (%arg1 = %init) : (i32) -> i32 {
+ %used_twice = emitc.call @foo_with_side_effect(%arg1, %p) : (i32, !emitc.ptr<i32>) -> i32
+ %prod = emitc.add %used_twice, %used_twice : (i32, i32) -> i32
+ %sum = emitc.add %arg1, %prod : (i32, i32) -> i32
+ %condition = emitc.cmp lt, %sum, %exit : (i32, i32) -> i1
+ scf.condition(%condition) %arg1 : i32
+ } do {
+ ^bb0(%arg2: i32):
+ %next_arg1 = emitc.call @payload_double_use(%arg2) : (i32) -> i32
+ scf.yield %next_arg1 : i32
+ }
+ return %res : i32
+}
+// CHECK-LABEL: emitc.func @payload_double_use(
+// CHECK-SAME: %[[ARG0:.*]]: i32) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: emitc.func @foo_with_side_effect(
+// CHECK-SAME: %[[ARG0:.*]]: i32,
+// CHECK-SAME: %[[ARG1:.*]]: !emitc.ptr<i32>) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: verbatim "{}[0] = {};" args %[[ARG1]], %[[VAL_0]] : !emitc.ptr<i32>, i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: func.func @double_use(
+// CHECK-SAME: %[[ARG0:.*]]: !emitc.ptr<i32>) -> i32 {
+// CHECK: %[[VAL_0:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_1:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_2:.*]] = emitc.literal "10.0" : i32
+// CHECK: %[[VAL_3:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_6:.*]] = load %[[VAL_4]] : <i32>
+// CHECK: %[[VAL_7:.*]] = call @foo_with_side_effect(%[[VAL_6]], %[[ARG0]]) : (i32, !emitc.ptr<i32>) -> i32
+// CHECK: %[[VAL_8:.*]] = add %[[VAL_7]], %[[VAL_7]] : (i32, i32) -> i32
+// CHECK: %[[VAL_9:.*]] = add %[[VAL_6]], %[[VAL_8]] : (i32, i32) -> i32
+// CHECK: %[[VAL_10:.*]] = cmp lt, %[[VAL_9]], %[[VAL_2]] : (i32, i32) -> i1
+// CHECK: assign %[[VAL_6]] : i32 to %[[VAL_3]] : <i32>
+// CHECK: assign %[[VAL_10]] : i1 to %[[VAL_5]] : <i1>
+// CHECK: if %[[VAL_10]] {
+// CHECK: %[[VAL_11:.*]] = call @payload_double_use(%[[VAL_6]]) : (i32) -> i32
+// CHECK: assign %[[VAL_11]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: }
+// CHECK: } while {
+// CHECK: %[[VAL_12:.*]] = expression %[[VAL_5]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_13:.*]] = load %[[VAL_5]] : <i1>
+// CHECK: yield %[[VAL_13]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_12]] : i1
+// CHECK: }
+// CHECK: %[[VAL_14:.*]] = emitc.load %[[VAL_3]] : <i32>
+// CHECK: return %[[VAL_14]] : i32
+// CHECK: }
+
+emitc.func @payload_empty_after_region() -> i1 {
+ %true = emitc.literal "true" : i1
+ return %true : i1
+}
+
+func.func @empty_after_region() {
+ scf.while () : () -> () {
+ %condition = emitc.call @payload_empty_after_region() : () -> i1
+ scf.condition(%condition)
+ } do {
+ ^bb0():
+ scf.yield
+ }
+ return
+}
+// CHECK-LABEL: emitc.func @payload_empty_after_region() -> i1 {
+// CHECK: %[[VAL_0:.*]] = literal "true" : i1
+// CHECK: return %[[VAL_0]] : i1
+// CHECK: }
+
+// CHECK-LABEL: func.func @empty_after_region() {
+// CHECK: %[[VAL_0:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_1:.*]] = call @payload_empty_after_region() : () -> i1
+// CHECK: assign %[[VAL_1]] : i1 to %[[VAL_0]] : <i1>
+// CHECK: } while {
+// CHECK: %[[VAL_2:.*]] = expression %[[VAL_0]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_3:.*]] = load %[[VAL_0]] : <i1>
+// CHECK: yield %[[VAL_3]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_2]] : i1
+// CHECK: }
+// CHECK: return
+// CHECK: }
+
+emitc.func @payload_different_number_of_vars(%arg0: i32) -> i32 {
+ %0 = add %arg0, %arg0 : (i32, i32) -> i32
+ return %0 : i32
+}
+func.func @different_number_of_vars() -> (i32, i32) {
+ %init = emitc.literal "1.0" : i32
+ %var = emitc.literal "7.0" : i32
+ %exit = emitc.literal "10.0" : i32
+ %res, %res2 = scf.while (%arg1 = %init) : (i32) -> (i32, i32) {
+ %sum = emitc.add %arg1, %var : (i32, i32) -> i32
+ %condition = emitc.cmp lt, %sum, %exit : (i32, i32) -> i1
+ %next = emitc.add %arg1, %arg1 : (i32, i32) -> i32
+ scf.condition(%condition) %next, %sum : i32, i32
+ } do {
+ ^bb0(%arg2: i32, %arg3 : i32):
+ %next_arg1 = emitc.call @payload_different_number_of_vars(%arg2) : (i32) -> i32
+ scf.yield %next_arg1 : i32
+ }
+ return %res, %res2 : i32, i32
+}
+// CHECK-LABEL: emitc.func @payload_different_number_of_vars(
+// CHECK-SAME: %[[ARG0:.*]]: i32) -> i32 {
+// CHECK: %[[VAL_0:.*]] = add %[[ARG0]], %[[ARG0]] : (i32, i32) -> i32
+// CHECK: return %[[VAL_0]] : i32
+// CHECK: }
+
+// CHECK-LABEL: func.func @different_number_of_vars() -> (i32, i32) {
+// CHECK: %[[VAL_0:.*]] = emitc.literal "1.0" : i32
+// CHECK: %[[VAL_1:.*]] = emitc.literal "7.0" : i32
+// CHECK: %[[VAL_2:.*]] = emitc.literal "10.0" : i32
+// CHECK: %[[VAL_3:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i32>
+// CHECK: emitc.assign %[[VAL_0]] : i32 to %[[VAL_5]] : <i32>
+// CHECK: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.lvalue<i1>
+// CHECK: emitc.do {
+// CHECK: %[[VAL_7:.*]] = load %[[VAL_5]] : <i32>
+// CHECK: %[[VAL_8:.*]] = add %[[VAL_7]], %[[VAL_1]] : (i32, i32) -> i32
+// CHECK: %[[VAL_9:.*]] = cmp lt, %[[VAL_8]], %[[VAL_2]] : (i32, i32) -> i1
+// CHECK: %[[VAL_10:.*]] = add %[[VAL_7]], %[[VAL_7]] : (i32, i32) -> i32
+// CHECK: assign %[[VAL_10]] : i32 to %[[VAL_3]] : <i32>
+// CHECK: assign %[[VAL_8]] : i32 to %[[VAL_4]] : <i32>
+// CHECK: assign %[[VAL_9]] : i1 to %[[VAL_6]] : <i1>
+// CHECK: if %[[VAL_9]] {
+// CHECK: %[[VAL_11:.*]] = call @payload_different_number_of_vars(%[[VAL_10]]) : (i32) -> i32
+// CHECK: assign %[[VAL_11]] : i32 to %[[VAL_5]] : <i32>
+// CHECK: }
+// CHECK: } while {
+// CHECK: %[[VAL_12:.*]] = expression %[[VAL_6]] : (!emitc.lvalue<i1>) -> i1 {
+// CHECK: %[[VAL_13:.*]] = load %[[VAL_6]] : <i1>
+// CHECK: yield %[[VAL_13]] : i1
+// CHECK: }
+// CHECK: yield %[[VAL_12]] : i1
+// CHECK: }
+// CHECK: %[[VAL_14:.*]] = emitc.load %[[VAL_3]] : <i32>
+// CHECK: %[[VAL_15:.*]] = emitc.load %[[VAL_4]] : <i32>
+// CHECK: return %[[VAL_14]], %[[VAL_15]] : i32, i32
+// CHECK: }
diff --git a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
index 8f60a07..b31a973 100644
--- a/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
+++ b/mlir/test/Conversion/XeVMToLLVM/xevm-to-llvm.mlir
@@ -261,3 +261,87 @@ llvm.func @llvm.store(%a: !llvm.ptr<1>, %val: i32) {
llvm.store %val, %a {cache_control=#xevm.store_cache_control<L1wt_L2uc_L3wb>} : i32, !llvm.ptr<1>
llvm.return
}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z30intel_sub_group_block_read_us8PU3AS1t
+// CHECK: llvm.func @blockload_as1(%[[ARG0:.*]]: !llvm.ptr<1>)
+llvm.func @blockload_as1(%ptr: !llvm.ptr<1>) -> vector<8xi16> {
+ // CHECK: %[[VAR0:.*]] = llvm.call spir_funccc @_Z30intel_sub_group_block_read_us8PU3AS1t(%[[ARG0]])
+ // CHECK-SAME: {function_type = !llvm.func<vector<8xi16> (ptr<1>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z30intel_sub_group_block_read_us8PU3AS1t",
+ // CHECK-SAME: visibility_ = 0 : i64, will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6442 : i32, 0 : i32, 1 : i32, 0 : i32],
+ // CHECK-SAME: [6442 : i32, 1 : i32, 1 : i32, 0 : i32]
+ %loaded_a = xevm.blockload %ptr <{cache_control=#xevm.load_cache_control<L1uc_L2uc_L3uc>}> : (!llvm.ptr<1>) -> vector<8xi16>
+ llvm.return %loaded_a : vector<8xi16>
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z31intel_sub_group_block_read_uc16PU3AS3h(!llvm.ptr<3>)
+// CHECK: llvm.func @blockload_as3(%[[ARG0:.*]]: !llvm.ptr<3>)
+llvm.func @blockload_as3(%ptr: !llvm.ptr<3>) -> vector<16xi8> {
+ // CHECK: %[[VAR0:.*]] = llvm.call spir_funccc @_Z31intel_sub_group_block_read_uc16PU3AS3h(%[[ARG0]])
+ // CHECK-SAME: {function_type = !llvm.func<vector<16xi8> (ptr<3>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z31intel_sub_group_block_read_uc16PU3AS3h", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6442 : i32, 0 : i32, 1 : i32, 0 : i32],
+ // CHECK-SAME: [6442 : i32, 1 : i32, 1 : i32, 0 : i32]
+ %loaded_a = xevm.blockload %ptr <{cache_control=#xevm.load_cache_control<L1uc_L2uc_L3uc>}> : (!llvm.ptr<3>) -> vector<16xi8>
+ llvm.return %loaded_a : vector<16xi8>
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z29intel_sub_group_block_read_ucPU3AS3h(!llvm.ptr<3>)
+// CHECK: llvm.func @blockload_scalar(%[[ARG0:.*]]: !llvm.ptr<3>)
+llvm.func @blockload_scalar(%ptr: !llvm.ptr<3>) -> i8 {
+ // CHECK: %[[VAR0:.*]] = llvm.call spir_funccc @_Z29intel_sub_group_block_read_ucPU3AS3h(%[[ARG0]])
+ // CHECK-SAME: {function_type = !llvm.func<i8 (ptr<3>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z29intel_sub_group_block_read_ucPU3AS3h", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6442 : i32, 0 : i32, 1 : i32, 0 : i32],
+ // CHECK-SAME: [6442 : i32, 1 : i32, 1 : i32, 0 : i32]
+ %loaded_a = xevm.blockload %ptr <{cache_control=#xevm.load_cache_control<L1uc_L2uc_L3uc>}> : (!llvm.ptr<3>) -> i8
+ llvm.return %loaded_a : i8
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z31intel_sub_group_block_write_ui8PU3AS1jDv8_j
+// CHECK: llvm.func @blockstore_as1(%[[ARG0:.*]]: !llvm.ptr<1>, %[[ARG1:.*]]: vector<8xi32>) {
+llvm.func @blockstore_as1(%ptr: !llvm.ptr<1>, %data: vector<8xi32>) {
+ // CHECK: llvm.call spir_funccc @_Z31intel_sub_group_block_write_ui8PU3AS1jDv8_j(%[[ARG0]], %[[ARG1]])
+ // CHECK-SAME: {function_type = !llvm.func<void (ptr<1>, vector<8xi32>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z31intel_sub_group_block_write_ui8PU3AS1jDv8_j", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6443 : i32, 0 : i32, 2 : i32, 0 : i32],
+ // CHECK-SAME: [6443 : i32, 1 : i32, 2 : i32, 0 : i32]
+ xevm.blockstore %ptr, %data <{cache_control=#xevm.store_cache_control<L1wt_L2uc_L3wb>}> : (!llvm.ptr<1>, vector<8xi32>)
+ llvm.return
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z31intel_sub_group_block_write_ul2PU3AS3mDv2_m
+// CHECK: llvm.func @blockstore_as3(%[[ARG0:.*]]: !llvm.ptr<3>, %[[ARG1:.*]]: vector<2xi64>) {
+llvm.func @blockstore_as3(%ptr: !llvm.ptr<3>, %data: vector<2xi64>) {
+ // CHECK: llvm.call spir_funccc @_Z31intel_sub_group_block_write_ul2PU3AS3mDv2_m(%[[ARG0]], %[[ARG1]])
+ // CHECK-SAME: {function_type = !llvm.func<void (ptr<3>, vector<2xi64>)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z31intel_sub_group_block_write_ul2PU3AS3mDv2_m", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6443 : i32, 0 : i32, 2 : i32, 0 : i32],
+ // CHECK-SAME: [6443 : i32, 1 : i32, 2 : i32, 0 : i32]
+ xevm.blockstore %ptr, %data <{cache_control=#xevm.store_cache_control<L1wt_L2uc_L3wb>}> : (!llvm.ptr<3>, vector<2xi64>)
+ llvm.return
+}
+
+// -----
+// CHECK-LABEL: llvm.func spir_funccc @_Z30intel_sub_group_block_write_ulPU3AS3mm
+// CHECK: llvm.func @blockstore_scalar(%[[ARG0:.*]]: !llvm.ptr<3>, %[[ARG1:.*]]: i64) {
+llvm.func @blockstore_scalar(%ptr: !llvm.ptr<3>, %data: i64) {
+ // CHECK: llvm.call spir_funccc @_Z30intel_sub_group_block_write_ulPU3AS3mm(%[[ARG0]], %[[ARG1]])
+ // CHECK-SAME: {function_type = !llvm.func<void (ptr<3>, i64)>, linkage = #llvm.linkage<external>,
+ // CHECK-SAME: no_unwind, sym_name = "_Z30intel_sub_group_block_write_ulPU3AS3mm", visibility_ = 0 : i64,
+ // CHECK-SAME: will_return, xevm.DecorationCacheControl =
+ // CHECK-SAME: [6443 : i32, 0 : i32, 2 : i32, 0 : i32],
+ // CHECK-SAME: [6443 : i32, 1 : i32, 2 : i32, 0 : i32]
+ xevm.blockstore %ptr, %data <{cache_control=#xevm.store_cache_control<L1wt_L2uc_L3wb>}> : (!llvm.ptr<3>, i64)
+ llvm.return
+}
diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir
index f4c15f5..5f594fb 100644
--- a/mlir/test/Dialect/EmitC/invalid_ops.mlir
+++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir
@@ -252,7 +252,7 @@ func.func @sub_pointer_pointer(%arg0: !emitc.ptr<f32>, %arg1: !emitc.ptr<f32>) {
// -----
func.func @test_misplaced_yield() {
- // expected-error @+1 {{'emitc.yield' op expects parent op to be one of 'emitc.expression, emitc.if, emitc.for, emitc.switch'}}
+ // expected-error @+1 {{'emitc.yield' op expects parent op to be one of 'emitc.do, emitc.expression, emitc.for, emitc.if, emitc.switch'}}
emitc.yield
return
}
@@ -729,3 +729,150 @@ emitc.class @testClass {
return
}
}
+
+// -----
+
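+// Verifier tests for 'emitc.do': the body region must not contain a
+// terminator, and the condition region must consist of exactly one
+// 'emitc.expression' returning 'i1' followed by an 'emitc.yield' of its
+// result.
+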
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op condition region must contain exactly two operations: 'emitc.expression' followed by 'emitc.yield', but found 3 operations}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ %3 = emitc.literal "3" : i32
+ emitc.yield %r : i1
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ // expected-error @+1 {{'emitc.do' op expected first op in condition region to be 'emitc.expression', but got emitc.literal}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %true = emitc.literal "true" : i1
+ emitc.yield %true : i1
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op emitc.expression in condition region must return 'i1', but returns 'i32'}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i32 {
+ %add = emitc.add %1, %2 : (i32, i32) -> i32
+ emitc.yield %add : i32
+ }
+
+ emitc.yield %r : i32
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op expected last op in condition region to be 'emitc.yield', but got emitc.expression}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r1 = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ %r2 = emitc.expression %1, %2 : (i32, i32) -> i32 {
+ %add = emitc.add %1, %2 : (i32, i32) -> i32
+ emitc.yield %add : i32
+ }
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op expected condition region to return 1 value, but it returns 0 values}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ emitc.yield
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ %true = emitc.literal "true" : i1
+
+ // expected-error @+1 {{'emitc.yield' must return result of 'emitc.expression' from this condition region}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ emitc.yield %true: i1
+ }
+
+ return
+}
+
+// -----
+
+func.func @test_do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+
+ // expected-error @+1 {{'emitc.do' op body region must not contain terminator}}
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ emitc.yield
+ } while {
+ %r = emitc.expression %1, %2 : (i32, i32) -> i1 {
+ %cmp = emitc.cmp eq, %1, %2 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ emitc.yield %r: i1
+ }
+
+ return
+}
diff --git a/mlir/test/Dialect/EmitC/ops.mlir b/mlir/test/Dialect/EmitC/ops.mlir
index 84c9b65..1259748 100644
--- a/mlir/test/Dialect/EmitC/ops.mlir
+++ b/mlir/test/Dialect/EmitC/ops.mlir
@@ -335,3 +335,23 @@ emitc.class final @finalClass {
return
}
}
+
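+// Round-trip test for 'emitc.do' with a multi-operand condition expression.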
+func.func @do(%arg0 : !emitc.ptr<i32>) {
+ %1 = emitc.literal "1" : i32
+ %2 = emitc.literal "2" : i32
+ %3 = emitc.literal "3" : i32
+
+ emitc.do {
+ emitc.verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ } while {
+ %r = emitc.expression %1, %2, %3 : (i32, i32, i32) -> i1 {
+ %add = emitc.add %1, %2 : (i32, i32) -> i32
+ %cmp = emitc.cmp eq, %add, %3 : (i32, i32) -> i1
+ emitc.yield %cmp : i1
+ }
+
+ emitc.yield %r : i1
+ }
+
+ return
+}
diff --git a/mlir/test/Dialect/Linalg/decompose-pack.mlir b/mlir/test/Dialect/Linalg/decompose-pack.mlir
index 17e6c29..18a09f4 100644
--- a/mlir/test/Dialect/Linalg/decompose-pack.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-pack.mlir
@@ -274,3 +274,24 @@ func.func @pack_with_adjacent_trailing_dimensions_inner_dims_pos_and_unit_outer(
// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
// CHECK-SAME: [0, 0, 0, 0, 0] [1, 1, 1, 4, 1] [1, 1, 1, 1, 1] : tensor<1x4x1xf32> into tensor<1x1x1x4x1xf32>
// CHECK: return %[[INSERT]]
+
+// -----
+
+// The following example shows a pack operation whose inner dimension
+// positions are non-adjacent and not permuted.
+func.func @pack_with_non_adjacent_and_non_permuted_inner_dims(%arg0: tensor<8x1x1x1xf32>, %arg1:tensor<1x1x1x1x8x1xf32>) -> tensor<1x1x1x1x8x1xf32> {
+ %pack = linalg.pack %arg0 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [0, 3] inner_tiles = [8, 1] into %arg1: tensor<8x1x1x1xf32> -> tensor<1x1x1x1x8x1xf32>
+ return %pack : tensor<1x1x1x1x8x1xf32>
+}
+
+// CHECK-LABEL: func.func @pack_with_non_adjacent_and_non_permuted_inner_dims
+// CHECK-SAME: %[[SRC:[a-zA-Z0-9]+]]
+// CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]]
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x1x8x1xf32>
+// CHECK: %[[TRANSP:.+]] = linalg.transpose
+// CHECK-SAME: ins(%[[SRC]] : tensor<8x1x1x1xf32>)
+// CHECK-SAME: outs(%[[EMPTY]] : tensor<1x1x8x1xf32>)
+// CHECK-SAME: permutation = [1, 2, 0, 3]
+// CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[TRANSP]] into %[[DEST]]
+// CHECK-SAME: [0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 8, 1] [1, 1, 1, 1, 1, 1] : tensor<1x1x8x1xf32> into tensor<1x1x1x1x8x1xf32>
+// CHECK: return %[[INSERT]]
diff --git a/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir b/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir
new file mode 100644
index 0000000..603ace8
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/pointer-like-interface-alloc.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(test-acc-pointer-like-interface{test-mode=alloc}))" 2>&1 | FileCheck %s
+
+func.func @test_static_memref_alloc() {
+ %0 = memref.alloca() {test.ptr} : memref<10x20xf32>
+ // CHECK: Successfully generated alloc for operation: %[[ORIG:.*]] = memref.alloca() {test.ptr} : memref<10x20xf32>
+ // CHECK: Generated: %{{.*}} = memref.alloca() : memref<10x20xf32>
+ return
+}
+
+// -----
+
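+// For dynamically shaped memrefs, the generated allocation first recovers the
+// runtime extents of the original value via memref.dim.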
+func.func @test_dynamic_memref_alloc() {
+ %c10 = arith.constant 10 : index
+ %c20 = arith.constant 20 : index
+ %orig = memref.alloc(%c10, %c20) {test.ptr} : memref<?x?xf32>
+
+ // CHECK: Successfully generated alloc for operation: %[[ORIG:.*]] = memref.alloc(%[[C10:.*]], %[[C20:.*]]) {test.ptr} : memref<?x?xf32>
+ // CHECK: Generated: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK: Generated: %[[DIM0:.*]] = memref.dim %[[ORIG]], %[[C0]] : memref<?x?xf32>
+ // CHECK: Generated: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK: Generated: %[[DIM1:.*]] = memref.dim %[[ORIG]], %[[C1]] : memref<?x?xf32>
+ // CHECK: Generated: %{{.*}} = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref<?x?xf32>
+ return
+}
diff --git a/mlir/test/Dialect/OpenACC/pointer-like-interface-copy.mlir b/mlir/test/Dialect/OpenACC/pointer-like-interface-copy.mlir
new file mode 100644
index 0000000..9220d84
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/pointer-like-interface-copy.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(test-acc-pointer-like-interface{test-mode=copy}))" 2>&1 | FileCheck %s
+
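+// genCopy is expected to produce a memref.copy from the value tagged
+// test.src_ptr to the value tagged test.dest_ptr.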
+func.func @test_copy_static() {
+ %src = memref.alloca() {test.src_ptr} : memref<10x20xf32>
+ %dest = memref.alloca() {test.dest_ptr} : memref<10x20xf32>
+
+ // CHECK: Successfully generated copy from source: %[[SRC:.*]] = memref.alloca() {test.src_ptr} : memref<10x20xf32> to destination: %[[DEST:.*]] = memref.alloca() {test.dest_ptr} : memref<10x20xf32>
+ // CHECK: Generated: memref.copy %[[SRC]], %[[DEST]] : memref<10x20xf32> to memref<10x20xf32>
+ return
+}
+
+// -----
+
+func.func @test_copy_dynamic() {
+ %c10 = arith.constant 10 : index
+ %c20 = arith.constant 20 : index
+ %src = memref.alloc(%c10, %c20) {test.src_ptr} : memref<?x?xf32>
+ %dest = memref.alloc(%c10, %c20) {test.dest_ptr} : memref<?x?xf32>
+
+ // CHECK: Successfully generated copy from source: %[[SRC:.*]] = memref.alloc(%[[C10:.*]], %[[C20:.*]]) {test.src_ptr} : memref<?x?xf32> to destination: %[[DEST:.*]] = memref.alloc(%[[C10]], %[[C20]]) {test.dest_ptr} : memref<?x?xf32>
+ // CHECK: Generated: memref.copy %[[SRC]], %[[DEST]] : memref<?x?xf32> to memref<?x?xf32>
+ return
+}
diff --git a/mlir/test/Dialect/OpenACC/pointer-like-interface-free.mlir b/mlir/test/Dialect/OpenACC/pointer-like-interface-free.mlir
new file mode 100644
index 0000000..ecf4f75
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/pointer-like-interface-free.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(test-acc-pointer-like-interface{test-mode=free}))" 2>&1 | FileCheck %s
+
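+// A stack allocation needs no explicit deallocation, so a successful free
+// generates no operations.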
+func.func @test_static_memref_free() {
+ %0 = memref.alloca() {test.ptr} : memref<10x20xf32>
+ // CHECK: Successfully generated free for operation: %[[ORIG:.*]] = memref.alloca() {test.ptr} : memref<10x20xf32>
+ // CHECK-NOT: Generated
+ return
+}
+
+// -----
+
+func.func @test_dynamic_memref_free() {
+ %c10 = arith.constant 10 : index
+ %c20 = arith.constant 20 : index
+ %orig = memref.alloc(%c10, %c20) {test.ptr} : memref<?x?xf32>
+
+ // CHECK: Successfully generated free for operation: %[[ORIG:.*]] = memref.alloc(%[[C10:.*]], %[[C20:.*]]) {test.ptr} : memref<?x?xf32>
+ // CHECK: Generated: memref.dealloc %[[ORIG]] : memref<?x?xf32>
+ return
+}
+
+// -----
+
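+// The interface walks through the cast to the underlying alloca; as above, a
+// stack allocation requires no generated deallocation.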
+func.func @test_cast_walking_free() {
+ %0 = memref.alloca() : memref<10x20xf32>
+ %1 = memref.cast %0 {test.ptr} : memref<10x20xf32> to memref<?x?xf32>
+
+ // CHECK: Successfully generated free for operation: %[[CAST:.*]] = memref.cast %[[ALLOCA:.*]] {test.ptr} : memref<10x20xf32> to memref<?x?xf32>
+ // CHECK-NOT: Generated
+ return
+}
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index c07edac..eb369c0 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -3322,6 +3322,46 @@ func.func @from_elements_to_elements_shuffle(%a: vector<4x2xf32>) -> vector<4x2x
// -----
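+
+// +---------------------------------------------------------------------------
+// Tests for folding vector.to_elements of vector.broadcast
+// +---------------------------------------------------------------------------
+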
+// CHECK-LABEL: func @to_elements_of_scalar_broadcast_folds
+// CHECK-SAME: (%[[S:.*]]: f32) -> (f32, f32, f32, f32)
+func.func @to_elements_of_scalar_broadcast_folds(%s: f32) -> (f32, f32, f32, f32) {
+ %v = vector.broadcast %s : f32 to vector<4xf32>
+ %e:4 = vector.to_elements %v : vector<4xf32>
+ // CHECK-NOT: vector.broadcast
+ // CHECK-NOT: vector.to_elements
+ // CHECK: return %[[S]], %[[S]], %[[S]], %[[S]]
+ return %e#0, %e#1, %e#2, %e#3 : f32, f32, f32, f32
+}
+
+// -----
+
+// CHECK-LABEL: func @to_elements_of_vector_broadcast
+// CHECK-SAME: (%[[VEC:.*]]: vector<2xf32>) -> (f32, f32, f32, f32, f32, f32)
+func.func @to_elements_of_vector_broadcast(%vec: vector<2xf32>) -> (f32, f32, f32, f32, f32, f32) {
+ %v = vector.broadcast %vec : vector<2xf32> to vector<3x2xf32>
+ %e:6 = vector.to_elements %v : vector<3x2xf32>
+ // CHECK-NOT: vector.broadcast
+ // CHECK: %[[SRC_ELEMS:.*]]:2 = vector.to_elements %[[VEC]]
+ // CHECK: return %[[SRC_ELEMS]]#0, %[[SRC_ELEMS]]#1, %[[SRC_ELEMS]]#0, %[[SRC_ELEMS]]#1, %[[SRC_ELEMS]]#0, %[[SRC_ELEMS]]#1
+ return %e#0, %e#1, %e#2, %e#3, %e#4, %e#5 : f32, f32, f32, f32, f32, f32
+}
+
+// -----
+
+// CHECK-LABEL: func @to_elements_of_vector_broadcast_inner_dim
+// CHECK-SAME: (%[[V:.*]]: vector<2x1x2xf32>) -> (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)
+func.func @to_elements_of_vector_broadcast_inner_dim(%v: vector<2x1x2xf32>) -> (f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) {
+ %b = vector.broadcast %v : vector<2x1x2xf32> to vector<2x3x2xf32>
+ %e:12 = vector.to_elements %b : vector<2x3x2xf32>
+ // CHECK-NOT: vector.broadcast
+ // CHECK: %[[SRC:.*]]:4 = vector.to_elements %[[V]] : vector<2x1x2xf32>
+ // CHECK: return %[[SRC]]#0, %[[SRC]]#1, %[[SRC]]#0, %[[SRC]]#1, %[[SRC]]#0, %[[SRC]]#1, %[[SRC]]#2, %[[SRC]]#3, %[[SRC]]#2, %[[SRC]]#3, %[[SRC]]#2, %[[SRC]]#3
+ return %e#0, %e#1, %e#2, %e#3, %e#4, %e#5, %e#6, %e#7, %e#8, %e#9, %e#10, %e#11 :
+ f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32
+}
+
+// -----
+
// +---------------------------------------------------------------------------
// Tests for foldFromElementsToConstant
// +---------------------------------------------------------------------------
diff --git a/mlir/test/Target/Cpp/do.mlir b/mlir/test/Target/Cpp/do.mlir
new file mode 100644
index 0000000..38cbc81
--- /dev/null
+++ b/mlir/test/Target/Cpp/do.mlir
@@ -0,0 +1,168 @@
+// RUN: mlir-translate -mlir-to-cpp %s | FileCheck %s -check-prefix=CPP-DEFAULT
+
+
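+// A simple counting loop: the condition region's expression is emitted inline
+// in the while clause.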
+// CPP-DEFAULT-LABEL: void emitc_do(
+// CPP-DEFAULT: int32_t* [[VAL_1:v[0-9]+]]) {
+// CPP-DEFAULT: int32_t [[VAL_2:v[0-9]+]] = 0;
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: printf("%d", *[[VAL_1]]);
+// CPP-DEFAULT: int32_t [[VAL_3:v[0-9]+]] = [[VAL_2]];
+// CPP-DEFAULT: int32_t [[VAL_4:v[0-9]+]] = [[VAL_3]] + 1;
+// CPP-DEFAULT: [[VAL_2]] = [[VAL_4]];
+// CPP-DEFAULT: } while ([[VAL_2]] <= 10);
+// CPP-DEFAULT: return;
+// CPP-DEFAULT: }
+
+emitc.func @emitc_do(%arg0 : !emitc.ptr<i32>) {
+ %var = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+ %0 = literal "10" : i32
+ %1 = literal "1" : i32
+
+ do {
+ verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ %var_load = load %var : <i32>
+ %tmp_add = add %var_load, %1 : (i32, i32) -> i32
+ "emitc.assign"(%var, %tmp_add) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %r = expression %var, %0 : (!emitc.lvalue<i32>, i32) -> i1 {
+ %var_load = load %var : <i32>
+ %cmp = cmp le, %var_load, %0 : (i32, i32) -> i1
+ yield %cmp : i1
+ }
+
+ yield %r : i1
+ }
+
+ return
+}
+
+
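+// An expression computed outside the loop is materialized into a variable
+// before the loop and reused in the emitted condition.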
+// CPP-DEFAULT-LABEL: void emitc_do_with_expression(
+// CPP-DEFAULT: int32_t* [[VAL_1:v[0-9]+]]) {
+// CPP-DEFAULT: int32_t [[VAL_2:v[0-9]+]] = 0;
+// CPP-DEFAULT: int32_t [[VAL_3:v[0-9]+]] = 10 + 1;
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: printf("%d", *[[VAL_1]]);
+// CPP-DEFAULT: int32_t [[VAL_4:v[0-9]+]] = [[VAL_2]];
+// CPP-DEFAULT: int32_t [[VAL_5:v[0-9]+]] = [[VAL_4]] + 1;
+// CPP-DEFAULT: [[VAL_2]] = [[VAL_5]];
+// CPP-DEFAULT: } while ([[VAL_2]] <= [[VAL_3]]);
+// CPP-DEFAULT: return;
+// CPP-DEFAULT: }
+
+emitc.func @emitc_do_with_expression(%arg0 : !emitc.ptr<i32>) {
+ %var = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+ %0 = literal "10" : i32
+ %1 = literal "1" : i32
+
+ %add = expression %0, %1 : (i32, i32) -> i32 {
+ %add = add %0, %1 : (i32, i32) -> i32
+ yield %add : i32
+ }
+
+ do {
+ verbatim "printf(\"%d\", *{});" args %arg0 : !emitc.ptr<i32>
+ %var_load = load %var : <i32>
+ %tmp_add = add %var_load, %1 : (i32, i32) -> i32
+ "emitc.assign"(%var, %tmp_add) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %r = expression %var, %add : (!emitc.lvalue<i32>, i32) -> i1 {
+ %var_load = load %var : <i32>
+ %cmp = cmp le, %var_load, %add : (i32, i32) -> i1
+ yield %cmp : i1
+ }
+
+ yield %r : i1
+ }
+
+ return
+}
+
+
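+// Nested emitc.do ops emit nested C do-while loops.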
+// CPP-DEFAULT-LABEL: void emitc_double_do()
+// CPP-DEFAULT: int32_t [[VAL_1:v[0-9]+]] = 0;
+// CPP-DEFAULT: int32_t [[VAL_2:v[0-9]+]] = 0;
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: int32_t [[VAL_3:v[0-9]+]] = [[VAL_1]];
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: int32_t [[VAL_4:v[0-9]+]] = [[VAL_2]];
+// CPP-DEFAULT: printf("i = %d, j = %d", [[VAL_3]], [[VAL_4]]);
+// CPP-DEFAULT: int32_t [[VAL_5:v[0-9]+]] = [[VAL_4]] + 1;
+// CPP-DEFAULT: [[VAL_2]] = [[VAL_5]];
+// CPP-DEFAULT: } while ([[VAL_2]] <= 5);
+// CPP-DEFAULT: int32_t [[VAL_6:v[0-9]+]] = [[VAL_3]] + 1;
+// CPP-DEFAULT: [[VAL_1]] = [[VAL_6]];
+// CPP-DEFAULT: } while ([[VAL_1]] <= 3);
+// CPP-DEFAULT: return;
+// CPP-DEFAULT: }
+
+emitc.func @emitc_double_do() {
+ %var_1 = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+ %var_2 = "emitc.variable"() <{value = 0 : i32}> : () -> !emitc.lvalue<i32>
+
+ %step = literal "1" : i32
+ %end_1 = literal "3" : i32
+ %end_2 = literal "5" : i32
+
+ do {
+ %var_1_load = load %var_1 : <i32>
+
+ do {
+ %var_2_load = load %var_2 : <i32>
+ verbatim "printf(\"i = %d, j = %d\", {}, {});" args %var_1_load, %var_2_load : i32, i32
+ %tmp_add = add %var_2_load, %step : (i32, i32) -> i32
+ "emitc.assign"(%var_2, %tmp_add) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %r = expression %var_2, %end_2 : (!emitc.lvalue<i32>, i32) -> i1 {
+ %var_2_load = load %var_2 : <i32>
+ %cmp = cmp le, %var_2_load, %end_2 : (i32, i32) -> i1
+ yield %cmp : i1
+ }
+
+ yield %r : i1
+ }
+
+ %tmp_add = add %var_1_load, %step : (i32, i32) -> i32
+ "emitc.assign"(%var_1, %tmp_add) : (!emitc.lvalue<i32>, i32) -> ()
+ } while {
+ %r = expression %var_1, %end_1 : (!emitc.lvalue<i32>, i32) -> i1 {
+ %var_1_load = load %var_1 : <i32>
+ %cmp = cmp le, %var_1_load, %end_1 : (i32, i32) -> i1
+ yield %cmp : i1
+ }
+
+ yield %r : i1
+ }
+
+ return
+}
+
+
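+// An empty body region emits an empty do-while; the condition here is a call
+// to a payload function.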
+// CPP-DEFAULT-LABEL: bool payload_do_with_empty_body(
+// CPP-DEFAULT: int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]]) {
+// CPP-DEFAULT: bool [[VAL_3:v[0-9]+]] = [[VAL_1]] < [[VAL_2]];
+// CPP-DEFAULT: return [[VAL_3]];
+// CPP-DEFAULT: }
+// CPP-DEFAULT: void emitc_do_with_empty_body(
+// CPP-DEFAULT: int32_t [[VAL_1:v[0-9]+]], int32_t [[VAL_2:v[0-9]+]]) {
+// CPP-DEFAULT: do {
+// CPP-DEFAULT: } while (payload_do_with_empty_body([[VAL_1]], [[VAL_2]]));
+// CPP-DEFAULT: return;
+// CPP-DEFAULT: }
+
+emitc.func @payload_do_with_empty_body(%1 : i32, %2 : i32) -> i1 {
+ %cmp = emitc.cmp lt, %1, %2 : (i32, i32) -> i1
+ return %cmp : i1
+}
+func.func @emitc_do_with_empty_body(%arg1 : i32, %arg2 : i32) {
+ emitc.do {
+ } while {
+ %r = emitc.expression %arg1, %arg2 : (i32, i32) -> i1 {
+ %call = emitc.call @payload_do_with_empty_body(%arg1, %arg2) : (i32, i32) -> i1
+ emitc.yield %call : i1
+ }
+ emitc.yield %r: i1
+ }
+
+ return
+}
diff --git a/mlir/test/lib/Dialect/CMakeLists.txt b/mlir/test/lib/Dialect/CMakeLists.txt
index 3b7bd9b..e31140a 100644
--- a/mlir/test/lib/Dialect/CMakeLists.txt
+++ b/mlir/test/lib/Dialect/CMakeLists.txt
@@ -12,6 +12,7 @@ add_subdirectory(Math)
add_subdirectory(MemRef)
add_subdirectory(Shard)
add_subdirectory(NVGPU)
+add_subdirectory(OpenACC)
add_subdirectory(SCF)
add_subdirectory(Shape)
add_subdirectory(SPIRV)
diff --git a/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt
new file mode 100644
index 0000000..f84055d
--- /dev/null
+++ b/mlir/test/lib/Dialect/OpenACC/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_mlir_library(MLIROpenACCTestPasses
+ TestOpenACC.cpp
+ TestPointerLikeTypeInterface.cpp
+
+ EXCLUDE_FROM_LIBMLIR
+)
+mlir_target_link_libraries(MLIROpenACCTestPasses PUBLIC
+ MLIRIR
+ MLIRArithDialect
+ MLIRFuncDialect
+ MLIRMemRefDialect
+ MLIROpenACCDialect
+ MLIRPass
+ MLIRSupport
+)
+
diff --git a/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp
new file mode 100644
index 0000000..9886240
--- /dev/null
+++ b/mlir/test/lib/Dialect/OpenACC/TestOpenACC.cpp
@@ -0,0 +1,23 @@
+//===- TestOpenACC.cpp - OpenACC Test Registration ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains unified registration for all OpenACC test passes.
+//
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+namespace test {
+
+// Forward declarations of individual test pass registration functions.
+void registerTestPointerLikeTypeInterfacePass();
+
+// Unified registration function for all OpenACC test passes.
+void registerTestOpenACC() { registerTestPointerLikeTypeInterfacePass(); }
+
+} // namespace test
+} // namespace mlir
diff --git a/mlir/test/lib/Dialect/OpenACC/TestPointerLikeTypeInterface.cpp b/mlir/test/lib/Dialect/OpenACC/TestPointerLikeTypeInterface.cpp
new file mode 100644
index 0000000..85f9283
--- /dev/null
+++ b/mlir/test/lib/Dialect/OpenACC/TestPointerLikeTypeInterface.cpp
@@ -0,0 +1,305 @@
+//===- TestPointerLikeTypeInterface.cpp - Test PointerLikeType interface --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains test passes that exercise the OpenACC PointerLikeType
+// interface methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+namespace {
+
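+/// Records each operation created through the builder so the tests can print
+/// exactly what the interface generated.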
+struct OperationTracker : public OpBuilder::Listener {
+ SmallVector<Operation *> insertedOps;
+
+ void notifyOperationInserted(Operation *op,
+ OpBuilder::InsertPoint previous) override {
+ insertedOps.push_back(op);
+ }
+};
+
+struct TestPointerLikeTypeInterfacePass
+ : public PassWrapper<TestPointerLikeTypeInterfacePass,
+ OperationPass<func::FuncOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestPointerLikeTypeInterfacePass)
+
+ TestPointerLikeTypeInterfacePass() = default;
+ TestPointerLikeTypeInterfacePass(const TestPointerLikeTypeInterfacePass &pass)
+ : PassWrapper(pass) {
+ testMode = pass.testMode;
+ }
+
+ Pass::Option<std::string> testMode{
+ *this, "test-mode",
+ llvm::cl::desc("Test mode: walk, alloc, copy, or free"),
+ llvm::cl::init("walk")};
+
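+  // Lit tests drive this pass via, e.g.:
+  //   --pass-pipeline="builtin.module(func.func(test-acc-pointer-like-interface{test-mode=alloc}))"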
+ StringRef getArgument() const override {
+ return "test-acc-pointer-like-interface";
+ }
+
+ StringRef getDescription() const override {
+ return "Test OpenACC PointerLikeType interface methods on any implementing "
+ "type";
+ }
+
+ void runOnOperation() override;
+
+ void getDependentDialects(DialectRegistry &registry) const override {
+ registry.insert<acc::OpenACCDialect>();
+ registry.insert<arith::ArithDialect>();
+ registry.insert<memref::MemRefDialect>();
+ }
+
+private:
+ void walkAndPrint();
+ void testGenAllocate(Operation *op, Value result, PointerLikeType pointerType,
+ OpBuilder &builder);
+ void testGenFree(Operation *op, Value result, PointerLikeType pointerType,
+ OpBuilder &builder);
+ void testGenCopy(Operation *srcOp, Operation *destOp, Value srcResult,
+ Value destResult, PointerLikeType pointerType,
+ OpBuilder &builder);
+
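+  /// An operation tagged for testing, together with its first result whose
+  /// type implements PointerLikeType.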
+ struct PointerCandidate {
+ Operation *op;
+ Value result;
+ PointerLikeType pointerType;
+ };
+};
+
+void TestPointerLikeTypeInterfacePass::runOnOperation() {
+ if (testMode == "walk") {
+ walkAndPrint();
+ return;
+ }
+
+ auto func = getOperation();
+ OpBuilder builder(&getContext());
+
+ if (testMode == "alloc" || testMode == "free") {
+ // Collect all candidates first
+ SmallVector<PointerCandidate> candidates;
+ func.walk([&](Operation *op) {
+ if (op->hasAttr("test.ptr")) {
+ for (auto result : op->getResults()) {
+ if (isa<PointerLikeType>(result.getType())) {
+ candidates.push_back(
+ {op, result, cast<PointerLikeType>(result.getType())});
+ break; // Only take the first PointerLikeType result
+ }
+ }
+ }
+ });
+
+ // Now test all candidates
+ for (const auto &candidate : candidates) {
+ if (testMode == "alloc")
+ testGenAllocate(candidate.op, candidate.result, candidate.pointerType,
+ builder);
+ else if (testMode == "free")
+ testGenFree(candidate.op, candidate.result, candidate.pointerType,
+ builder);
+ }
+ } else if (testMode == "copy") {
+ // Collect all source and destination candidates
+ SmallVector<PointerCandidate> sources, destinations;
+
+ func.walk([&](Operation *op) {
+ if (op->hasAttr("test.src_ptr")) {
+ for (auto result : op->getResults()) {
+ if (isa<PointerLikeType>(result.getType())) {
+ sources.push_back(
+ {op, result, cast<PointerLikeType>(result.getType())});
+ break;
+ }
+ }
+ }
+ if (op->hasAttr("test.dest_ptr")) {
+ for (auto result : op->getResults()) {
+ if (isa<PointerLikeType>(result.getType())) {
+ destinations.push_back(
+ {op, result, cast<PointerLikeType>(result.getType())});
+ break;
+ }
+ }
+ }
+ });
+
+ // Try copying from each source to each destination
+ for (const auto &src : sources)
+ for (const auto &dest : destinations)
+ testGenCopy(src.op, dest.op, src.result, dest.result, src.pointerType,
+ builder);
+ }
+}
+
+void TestPointerLikeTypeInterfacePass::walkAndPrint() {
+ auto func = getOperation();
+
+ func.walk([&](Operation *op) {
+ // Look for operations marked with "test.ptr", "test.src_ptr", or
+ // "test.dest_ptr"
+ if (op->hasAttr("test.ptr") || op->hasAttr("test.src_ptr") ||
+ op->hasAttr("test.dest_ptr")) {
+ llvm::errs() << "Operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+
+ // Check each result to see if it's a PointerLikeType
+ for (auto result : op->getResults()) {
+ if (isa<PointerLikeType>(result.getType())) {
+ llvm::errs() << " Result " << result.getResultNumber()
+ << " is PointerLikeType: ";
+ result.getType().print(llvm::errs());
+ llvm::errs() << "\n";
+ } else {
+ llvm::errs() << " Result " << result.getResultNumber()
+ << " is NOT PointerLikeType: ";
+ result.getType().print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+ }
+
+ if (op->getNumResults() == 0)
+ llvm::errs() << " Operation has no results\n";
+
+ llvm::errs() << "\n";
+ }
+ });
+}
+
+void TestPointerLikeTypeInterfacePass::testGenAllocate(
+ Operation *op, Value result, PointerLikeType pointerType,
+ OpBuilder &builder) {
+ Location loc = op->getLoc();
+
+ // Create a new builder with the listener and set insertion point
+ OperationTracker tracker;
+ OpBuilder newBuilder(op->getContext());
+ newBuilder.setListener(&tracker);
+ newBuilder.setInsertionPointAfter(op);
+
+ // Call the genAllocate API
+ Value allocRes = pointerType.genAllocate(newBuilder, loc, "test_alloc",
+ result.getType(), result);
+
+ if (allocRes) {
+ llvm::errs() << "Successfully generated alloc for operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+
+ // Print all operations that were inserted
+ for (Operation *insertedOp : tracker.insertedOps) {
+ llvm::errs() << "\tGenerated: ";
+ insertedOp->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+ } else {
+ llvm::errs() << "Failed to generate alloc for operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+}
+
+void TestPointerLikeTypeInterfacePass::testGenFree(Operation *op, Value result,
+ PointerLikeType pointerType,
+ OpBuilder &builder) {
+ Location loc = op->getLoc();
+
+ // Create a new builder with the listener and set insertion point
+ OperationTracker tracker;
+ OpBuilder newBuilder(op->getContext());
+ newBuilder.setListener(&tracker);
+ newBuilder.setInsertionPointAfter(op);
+
+ // Call the genFree API
+ auto typedResult = cast<TypedValue<PointerLikeType>>(result);
+ bool success =
+ pointerType.genFree(newBuilder, loc, typedResult, result.getType());
+
+ if (success) {
+ llvm::errs() << "Successfully generated free for operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+
+ // Print all operations that were inserted
+ for (Operation *insertedOp : tracker.insertedOps) {
+ llvm::errs() << "\tGenerated: ";
+ insertedOp->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+ } else {
+ llvm::errs() << "Failed to generate free for operation: ";
+ op->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+}
+
+void TestPointerLikeTypeInterfacePass::testGenCopy(
+ Operation *srcOp, Operation *destOp, Value srcResult, Value destResult,
+ PointerLikeType pointerType, OpBuilder &builder) {
+ Location loc = destOp->getLoc();
+
+ // Create a new builder with the listener and set insertion point
+ OperationTracker tracker;
+ OpBuilder newBuilder(destOp->getContext());
+ newBuilder.setListener(&tracker);
+ newBuilder.setInsertionPointAfter(destOp);
+
+ // Call the genCopy API with the provided source and destination
+ auto typedSrc = cast<TypedValue<PointerLikeType>>(srcResult);
+ auto typedDest = cast<TypedValue<PointerLikeType>>(destResult);
+ bool success = pointerType.genCopy(newBuilder, loc, typedDest, typedSrc,
+ srcResult.getType());
+
+ if (success) {
+ llvm::errs() << "Successfully generated copy from source: ";
+ srcOp->print(llvm::errs());
+ llvm::errs() << " to destination: ";
+ destOp->print(llvm::errs());
+ llvm::errs() << "\n";
+
+ // Print all operations that were inserted
+ for (Operation *insertedOp : tracker.insertedOps) {
+ llvm::errs() << "\tGenerated: ";
+ insertedOp->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+ } else {
+ llvm::errs() << "Failed to generate copy from source: ";
+ srcOp->print(llvm::errs());
+ llvm::errs() << " to destination: ";
+ destOp->print(llvm::errs());
+ llvm::errs() << "\n";
+ }
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Pass Registration
+//===----------------------------------------------------------------------===//
+
+namespace mlir {
+namespace test {
+void registerTestPointerLikeTypeInterfacePass() {
+ PassRegistration<TestPointerLikeTypeInterfacePass>();
+}
+} // namespace test
+} // namespace mlir
diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt
index 7cc6e78..c607ccf 100644
--- a/mlir/tools/mlir-opt/CMakeLists.txt
+++ b/mlir/tools/mlir-opt/CMakeLists.txt
@@ -28,6 +28,7 @@ if(MLIR_INCLUDE_TESTS)
MLIRTestMemRefToLLVMWithTransforms
MLIRShardTest
MLIRNVGPUTestPasses
+ MLIROpenACCTestPasses
MLIRSCFTestPasses
MLIRShapeTestPasses
MLIRSPIRVTestPasses
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index e4620c0..6432fae 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -135,6 +135,7 @@ void registerTestShardSimplificationsPass();
void registerTestMultiBuffering();
void registerTestNextAccessPass();
void registerTestNVGPULowerings();
+void registerTestOpenACC();
void registerTestOneShotModuleBufferizePass();
void registerTestOpaqueLoc();
void registerTestOpLoweringPasses();
@@ -282,6 +283,7 @@ void registerTestPasses() {
mlir::test::registerTestMultiBuffering();
mlir::test::registerTestNextAccessPass();
mlir::test::registerTestNVGPULowerings();
+ mlir::test::registerTestOpenACC();
mlir::test::registerTestOneShotModuleBufferizePass();
mlir::test::registerTestOpaqueLoc();
mlir::test::registerTestOpLoweringPasses();
diff --git a/openmp/runtime/README.txt b/openmp/runtime/README.txt
index ddd8b0e..60a0c70 100644
--- a/openmp/runtime/README.txt
+++ b/openmp/runtime/README.txt
@@ -52,6 +52,7 @@ Architectures Supported
* Aarch64 (64-bit ARM) architecture
* IBM(R) Power architecture (big endian)
* IBM(R) Power architecture (little endian)
+* IBM(R) Z (s390x) architecture
* MIPS and MIPS64 architecture
* RISCV64 architecture
* LoongArch64 architecture